diff --git a/cron/weekly/checkent b/cron/weekly/checkent index d434633..fa69d5b 100755 --- a/cron/weekly/checkent +++ b/cron/weekly/checkent @@ -3,6 +3,6 @@ . `dirname $0`/../../build-ops cd ${SCRIPTSDIR} -${PHP} checkent.php phpdoc > ${DOCWEB}/www/checkent_php.php -${PHP} checkent.php peardoc > ${DOCWEB}/www/checkent_pear.php -${PHP} checkent.php smarty > ${DOCWEB}/www/checkent_smarty.php +${PHP} checkent.php phpdoc > /dev/null +${PHP} checkent.php peardoc > /dev/null +${PHP} checkent.php smarty > /dev/null diff --git a/scripts/checkent.php b/scripts/checkent.php index dba87a0..dd98fc9 100755 --- a/scripts/checkent.php +++ b/scripts/checkent.php @@ -1,43 +1,46 @@ | - * | Gabor Hojsty | - * | Docweb port: Nuno Lopes | - * | Mehdi Achour | - * +----------------------------------------------------------------------+ - * $Id$ - */ +/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: ++----------------------------------------------------------------------+ +| PHP Documentation Site Source Code | ++----------------------------------------------------------------------+ +| Copyright (c) 1997-2005 The PHP Group | ++----------------------------------------------------------------------+ +| This source file is subject to version 3.0 of the PHP license, | +| that is bundled with this package in the file LICENSE, and is | +| available at through the world-wide-web at | +| http://www.php.net/license/3_0.txt. | +| If you did not receive a copy of the PHP license and are unable to | +| obtain it through the world-wide-web, please send a note to | +| license@php.net so we can mail you a copy immediately. 
| ++----------------------------------------------------------------------+ +| Authors: Georg Richter | +| Gabor Hojsty | +| Docweb port: Nuno Lopes | +| Mehdi Achour | +| Sean Coates | ++----------------------------------------------------------------------+ +$Id$ +*/ set_time_limit(0); $inCli = true; -include '../include/init.inc.php'; - -define('DOCWEB_CRAWLER_USER_AGENT', 'DocWeb Link Crawler (http://doc.php.net)'); +require_once '../include/init.inc.php'; +require_once '../include/lib_url_entities.inc.php'; switch (isset($_SERVER['argv'][1]) ? $_SERVER['argv'][1] : false) { case 'phpdoc': $filename = CVS_DIR . '/phpdoc-all/entities/global.ent'; + $entType = 'php'; break; case 'peardoc': $filename = CVS_DIR . '/peardoc/global.ent'; + $entType = 'pear'; break; case 'smarty': $filename = CVS_DIR . '/smarty/docs/entities/global.ent'; + $entType = 'smarty'; break; default: @@ -56,157 +59,38 @@ if (extension_loaded('openssl')) { $schemes[] = 'https'; } -// constants for errors -define('UNKNOWN_HOST', 1); -define('FTP_CONNECT', 2); -define('FTP_LOGIN', 3); -define('FTP_NO_FILE', 4); -define('HTTP_CONNECT', 5); -define('HTTP_MOVED', 6); -define('HTTP_WRONG_HEADER', 7); -define('HTTP_INTERNAL_ERROR', 8); -define('HTTP_NOT_FOUND', 9); - - -/** - * Handles relative HTTP URLs - * - * @param string $url URL to handle - * @param array $parsed result of parse_url() - * @return string fixed URL - */ -function fix_relative_url ($url, $parsed) -{ - if ($url{0} == '/') { - return "{$parsed['scheme']}://{$parsed['host']}{$url}"; - } - - if (preg_match('@(?:f|ht)tps?://@S', $url)) { - return $url; - } - - /* try to be RFC 1808 compliant */ - $path = $parsed['path'] . $url; - $old = ''; - - do { - $old = $path; - $path = preg_replace('@[^/:?]+/\.\./|\./@S', '', $path); - } while ($old != $path); - - return "{$parsed['scheme']}://{$parsed['host']}{$path}"; +$dbFile = SQLITE_DIR . 
"checkent_{$entType}.sqlite"; +if (is_file($dbFile) && !unlink($dbFile)) { + echo "Error removing old database.\n"; + die(); } - -/** - * Checks a URL (actually fetches the URL and returns the status) - * - * @param int $num sequence number of URL - * @param string $entity_url URL to check - * @return array - */ -function check_url ($num, $entity_url) -{ - static $old_host = ''; - - // Get the parts of the URL - $url = parse_url($entity_url); - $entity = $GLOBALS['entity_names'][$num]; - - // sleep if accessing the same host more that once in a row - if ($url['host'] == $old_host) { - sleep(5); - } else { - $old_host = $url['host']; - } - - // Try to find host - if (gethostbyname($url['host']) == $url['host']) { - return array(UNKNOWN_HOST, array($num)); - } - - switch($url['scheme']) { - - case 'http': - case 'https': - if (isset($url['path'])) { - $url['path'] = $url['path'] . (isset($url['query']) ? '?' . $url['query'] : ''); - } else { - $url['path'] = '/'; - } - - /* check if using secure http */ - if ($url['scheme'] == 'https') { - $port = 443; - $scheme = 'ssl://'; - } else { - $port = 80; - $scheme = ''; - } - $port = isset($url['port']) ? $url['port'] : $port; - - if (!$fp = @fsockopen($scheme . $url['host'], $port)) { - return array(HTTP_CONNECT, array($num)); - - } else { - fputs($fp, "HEAD {$url['path']} HTTP/1.0\r\nHost: {$url['host']}\r\nUser-agent: ". 
DOCWEB_CRAWLER_USER_AGENT ."\r\nConnection: close\r\n\r\n"); - - $str = ''; - while (!feof($fp)) { - $str .= @fgets($fp, 2048); - } - fclose ($fp); - - if (preg_match('@HTTP/1.\d (\d+)(?: .+)?@S', $str, $match)) { - if ($match[1] != '200') { - switch ($match[1]) - { - case '500' : - case '501' : - return array(HTTP_INTERNAL_ERROR, array($num)); - break; - - case '404' : - return array(HTTP_NOT_FOUND, array($num)); - break; - - case '301' : - case '302' : - if (preg_match('/Location: (.+)/', $str, $redir)) { - return array(HTTP_MOVED, array($num, fix_relative_url($redir[1], $url))); - } else { - return array(HTTP_WRONG_HEADER, array($num, $str)); - } - break; - - default : - return array(HTTP_WRONG_HEADER, array($num, $str)); - } - } // error != 200 - } else { - return array(HTTP_WRONG_HEADER, array($num, $str)); - } - } - break; - - case 'ftp': - if ($ftp = @ftp_connect($url['host'])) { - - if (@ftp_login($ftp, 'anonymous', 'IEUser@')) { - $flist = ftp_nlist($ftp, $url['path']); - if (!count($flist)) { - return array(FTP_NO_FILE, array($num)); - } - } else { - return array(FTP_LOGIN, array($num)); - } - @ftp_quit($ftp); - } else { - return array(FTP_CONNECT, array($num)); - } - break; - } +if (!($sqlite = sqlite_open($dbFile, 0666))) { + die("Error creating database.\n"); /* abort: without a database handle the crawl results cannot be stored */ } - +$sqlCreateMeta = " + CREATE + TABLE + meta_info + ( + start_time DATETIME, + end_time DATETIME, + schemes VARCHAR(100) + ); +"; +$sqlCreateChecked = " + CREATE + TABLE + checked_urls + ( + url_num INT, + entity VARCHAR(255), + url VARCHAR(255), + check_result INT, + return_val VARCHAR(255) + ); +"; +sqlite_query($sqlite, $sqlCreateMeta); +sqlite_query($sqlite, $sqlCreateChecked); if (!$file = @file_get_contents($filename)) { // ouput the html @@ -230,14 +114,48 @@ $entity_urls = $entities_found[3]; $errors = array(); $numb = 0; +$sql = " + INSERT + INTO + meta_info (start_time, end_time, schemes) + VALUES + (". time() .", NULL, '". 
sqlite_escape_string(implode(',', $schemes)) ."') +"; +sqlite_query($sqlite, $sql); + +echo "Found: ". count($entity_urls) ." URLs\n"; + // Walk through entities found foreach ($entity_urls as $num => $entity_url) { ++$numb; + echo "Checking: $entity_url\n"; $err = check_url($num, $entity_url); $errors[$err[0]][] = $err[1]; + $return_val = isset($err[1][1]) ? $err[1][1] : ''; /* the result code is cast to int in the INSERT below so a missing code is stored as 0 instead of producing invalid SQL */ + $sql = " + INSERT + INTO + checked_urls (url_num, entity, url, check_result, return_val) + VALUES + ( + $num, + '". sqlite_escape_string($entity_names[$num]) ."', + '". sqlite_escape_string($entity_url) ."', + ". (int) $err[0] .", + '". sqlite_escape_string($return_val) ."' + ) + "; + sqlite_query($sqlite, $sql); } +$sql = " + UPDATE + meta_info + SET + end_time = ". time() ." +"; +sqlite_query($sqlite, $sql); // ouput the html echo " | +| Mehdi Achour | +| Sean Coates | ++----------------------------------------------------------------------+ +$Id$ +*/ -include '../include/init.inc.php'; +require_once '../include/init.inc.php'; +require_once '../include/lib_url_entities.inc.php'; -if(is_file('checkent_' . SITE . '.php')) { - include 'checkent_' . SITE . '.php'; -} else { +$dbFile = SQLITE_DIR . "checkent_" . SITE . ".sqlite"; +if (!($sqlite = @sqlite_open($dbFile))) { echo site_header('docweb.common.header.checkent'); - echo '

checkent not found!

'; + echo '

checkent not found!

'; // @@@ template this echo site_footer(); + exit(); } +$sql = " + SELECT + start_time, end_time, schemes + FROM + meta_info +"; +list($startTime, $endTime, $schemes) = sqlite_fetch_array(sqlite_query($sqlite, $sql)); + +$entData = array(); +$sql = " + SELECT + url_num, entity, url, check_result, return_val + FROM + checked_urls + WHERE + check_result > 0 + ORDER BY + check_result, entity +"; +$urlsQ = sqlite_query($sqlite, $sql); +while ($row = sqlite_fetch_array($urlsQ)) { + $entData[$row['check_result']][] = $row; +} + + +echo site_header('docweb.common.header.checkent'); +echo DocWeb_Template::get( + 'checkent.tpl.php', + array( + 'startTime' => $startTime, + 'isComplete' => $endTime ? TRUE : FALSE, + 'schemes' => $schemes, + 'entData' => $entData, + 'resultLkp' => $urlResultLookup, + 'extraCol' => $urlResultExtraCol, + ) + ); +echo site_footer(); + ?>