massive re-working of the checkent script

This commit is contained in:
Sean Coates
2005-01-22 06:56:59 +00:00
parent 59aa900f47
commit 50af6149d0
3 changed files with 159 additions and 181 deletions

View File

@ -3,6 +3,6 @@
. `dirname $0`/../../build-ops . `dirname $0`/../../build-ops
cd ${SCRIPTSDIR} cd ${SCRIPTSDIR}
${PHP} checkent.php phpdoc > ${DOCWEB}/www/checkent_php.php ${PHP} checkent.php phpdoc > /dev/null
${PHP} checkent.php peardoc > ${DOCWEB}/www/checkent_pear.php ${PHP} checkent.php peardoc > /dev/null
${PHP} checkent.php smarty > ${DOCWEB}/www/checkent_smarty.php ${PHP} checkent.php smarty > /dev/null

View File

@ -1,43 +1,46 @@
<?php <?php
/** /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4:
* +----------------------------------------------------------------------+ +----------------------------------------------------------------------+
* | PHP Documentation Site Source Code | | PHP Documentation Site Source Code |
* +----------------------------------------------------------------------+ +----------------------------------------------------------------------+
* | Copyright (c) 1997-2005 The PHP Group | | Copyright (c) 1997-2005 The PHP Group |
* +----------------------------------------------------------------------+ +----------------------------------------------------------------------+
* | This source file is subject to version 3.0 of the PHP license, | | This source file is subject to version 3.0 of the PHP license, |
* | that is bundled with this package in the file LICENSE, and is | | that is bundled with this package in the file LICENSE, and is |
* | available at through the world-wide-web at | | available at through the world-wide-web at |
* | http://www.php.net/license/3_0.txt. | | http://www.php.net/license/3_0.txt. |
* | If you did not receive a copy of the PHP license and are unable to | | If you did not receive a copy of the PHP license and are unable to |
* | obtain it through the world-wide-web, please send a note to | | obtain it through the world-wide-web, please send a note to |
* | license@php.net so we can mail you a copy immediately. | | license@php.net so we can mail you a copy immediately. |
* +----------------------------------------------------------------------+ +----------------------------------------------------------------------+
* | Authors: Georg Richter <georg@php.net> | | Authors: Georg Richter <georg@php.net> |
* | Gabor Hojtsy <goba@php.net> | | Gabor Hojtsy <goba@php.net> |
* | Docweb port: Nuno Lopes <nlopess@php.net> | | Docweb port: Nuno Lopes <nlopess@php.net> |
* | Mehdi Achour <didou@php.net> | | Mehdi Achour <didou@php.net> |
* +----------------------------------------------------------------------+ | Sean Coates <sean@php.net> |
* $Id$ +----------------------------------------------------------------------+
*/ $Id$
*/
set_time_limit(0); set_time_limit(0);
$inCli = true; $inCli = true;
include '../include/init.inc.php'; require_once '../include/init.inc.php';
require_once '../include/lib_url_entities.inc.php';
define('DOCWEB_CRAWLER_USER_AGENT', 'DocWeb Link Crawler (http://doc.php.net)');
switch (isset($_SERVER['argv'][1]) ? $_SERVER['argv'][1] : false) { switch (isset($_SERVER['argv'][1]) ? $_SERVER['argv'][1] : false) {
case 'phpdoc': case 'phpdoc':
$filename = CVS_DIR . '/phpdoc-all/entities/global.ent'; $filename = CVS_DIR . '/phpdoc-all/entities/global.ent';
$entType = 'php';
break; break;
case 'peardoc': case 'peardoc':
$filename = CVS_DIR . '/peardoc/global.ent'; $filename = CVS_DIR . '/peardoc/global.ent';
$entType = 'pear';
break; break;
case 'smarty': case 'smarty':
$filename = CVS_DIR . '/smarty/docs/entities/global.ent'; $filename = CVS_DIR . '/smarty/docs/entities/global.ent';
$entType = 'smarty';
break; break;
default: default:
@ -56,157 +59,38 @@ if (extension_loaded('openssl')) {
$schemes[] = 'https'; $schemes[] = 'https';
} }
// constants for errors $dbFile = SQLITE_DIR . "checkent_{$entType}.sqlite";
define('UNKNOWN_HOST', 1); if (is_file($dbFile) && !unlink($dbFile)) {
define('FTP_CONNECT', 2); echo "Error removing old database.\n";
define('FTP_LOGIN', 3); die();
define('FTP_NO_FILE', 4);
define('HTTP_CONNECT', 5);
define('HTTP_MOVED', 6);
define('HTTP_WRONG_HEADER', 7);
define('HTTP_INTERNAL_ERROR', 8);
define('HTTP_NOT_FOUND', 9);
/**
* Handles relative HTTP URLs
*
* @param string $url URL to handle
* @param array $parsed result of parse_url()
* @return string fixed URL
*/
function fix_relative_url ($url, $parsed)
{
if ($url{0} == '/') {
return "{$parsed['scheme']}://{$parsed['host']}{$url}";
}
if (preg_match('@(?:f|ht)tps?://@S', $url)) {
return $url;
}
/* try to be RFC 1808 compliant */
$path = $parsed['path'] . $url;
$old = '';
do {
$old = $path;
$path = preg_replace('@[^/:?]+/\.\./|\./@S', '', $path);
} while ($old != $path);
return "{$parsed['scheme']}://{$parsed['host']}{$path}";
} }
if (!($sqlite = sqlite_open($dbFile, 0666))) {
/** echo "Error creating database.\n";
* Checks a URL (actually fetches the URL and returns the status)
*
* @param int $num sequence number of URL
* @param string $entity_url URL to check
* @return array
*/
function check_url ($num, $entity_url)
{
static $old_host = '';
// Get the parts of the URL
$url = parse_url($entity_url);
$entity = $GLOBALS['entity_names'][$num];
// sleep if accessing the same host more than once in a row
if ($url['host'] == $old_host) {
sleep(5);
} else {
$old_host = $url['host'];
}
// Try to find host
if (gethostbyname($url['host']) == $url['host']) {
return array(UNKNOWN_HOST, array($num));
}
switch($url['scheme']) {
case 'http':
case 'https':
if (isset($url['path'])) {
$url['path'] = $url['path'] . (isset($url['query']) ? '?' . $url['query'] : '');
} else {
$url['path'] = '/';
}
/* check if using secure http */
if ($url['scheme'] == 'https') {
$port = 443;
$scheme = 'ssl://';
} else {
$port = 80;
$scheme = '';
}
$port = isset($url['port']) ? $url['port'] : $port;
if (!$fp = @fsockopen($scheme . $url['host'], $port)) {
return array(HTTP_CONNECT, array($num));
} else {
fputs($fp, "HEAD {$url['path']} HTTP/1.0\r\nHost: {$url['host']}\r\nUser-agent: ". DOCWEB_CRAWLER_USER_AGENT ."\r\nConnection: close\r\n\r\n");
$str = '';
while (!feof($fp)) {
$str .= @fgets($fp, 2048);
}
fclose ($fp);
if (preg_match('@HTTP/1.\d (\d+)(?: .+)?@S', $str, $match)) {
if ($match[1] != '200') {
switch ($match[1])
{
case '500' :
case '501' :
return array(HTTP_INTERNAL_ERROR, array($num));
break;
case '404' :
return array(HTTP_NOT_FOUND, array($num));
break;
case '301' :
case '302' :
if (preg_match('/Location: (.+)/', $str, $redir)) {
return array(HTTP_MOVED, array($num, fix_relative_url($redir[1], $url)));
} else {
return array(HTTP_WRONG_HEADER, array($num, $str));
}
break;
default :
return array(HTTP_WRONG_HEADER, array($num, $str));
}
} // error != 200
} else {
return array(HTTP_WRONG_HEADER, array($num, $str));
}
}
break;
case 'ftp':
if ($ftp = @ftp_connect($url['host'])) {
if (@ftp_login($ftp, 'anonymous', 'IEUser@')) {
$flist = ftp_nlist($ftp, $url['path']);
if (!count($flist)) {
return array(FTP_NO_FILE, array($num));
}
} else {
return array(FTP_LOGIN, array($num));
}
@ftp_quit($ftp);
} else {
return array(FTP_CONNECT, array($num));
}
break;
}
} }
$sqlCreateMeta = "
CREATE
TABLE
meta_info
(
start_time DATETIME,
end_time DATETIME,
schemes VARCHAR(100)
);
";
$sqlCreateChecked = "
CREATE
TABLE
checked_urls
(
url_num INT,
entity VARCHAR(255),
url VARCHAR(255),
check_result INT,
return_val VARCHAR(255)
);
";
sqlite_query($sqlite, $sqlCreateMeta);
sqlite_query($sqlite, $sqlCreateChecked);
if (!$file = @file_get_contents($filename)) { if (!$file = @file_get_contents($filename)) {
// output the html // output the html
@ -230,14 +114,48 @@ $entity_urls = $entities_found[3];
$errors = array(); $errors = array();
$numb = 0; $numb = 0;
$sql = "
INSERT
INTO
meta_info (start_time, end_time, schemes)
VALUES
(". time() .", NULL, '". sqlite_escape_string(implode(',', $schemes)) ."')
";
sqlite_query($sqlite, $sql);
echo "Found: ". count($entity_urls) ."URLs\n";
// Walk through entities found // Walk through entities found
foreach ($entity_urls as $num => $entity_url) { foreach ($entity_urls as $num => $entity_url) {
++$numb; ++$numb;
echo "Checking: $entity_url\n";
$err = check_url($num, $entity_url); $err = check_url($num, $entity_url);
$errors[$err[0]][] = $err[1]; $errors[$err[0]][] = $err[1];
$return_val = isset($err[1][1]) ? $err[1][1] : '';
$sql = "
INSERT
INTO
checked_urls (url_num, entity, url, check_result, return_val)
VALUES
(
$num,
'". sqlite_escape_string($entity_names[$num]) ."',
'". sqlite_escape_string($entity_url) ."',
{$err[0]},
'". sqlite_escape_string($return_val) ."'
)
";
sqlite_query($sqlite, $sql);
} }
$sql = "
UPDATE
meta_info
SET
end_time = ". time() ."
";
sqlite_query($sqlite, $sql);
// output the html // output the html
echo "<?php include_once '../include/init.inc.php'; echo "<?php include_once '../include/init.inc.php';

View File

@ -1,13 +1,73 @@
<?php <?php
/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4:
+----------------------------------------------------------------------+
| PHP Documentation Site Source Code |
+----------------------------------------------------------------------+
| Copyright (c) 1997-2005 The PHP Group |
+----------------------------------------------------------------------+
| This source file is subject to version 3.0 of the PHP license, |
| that is bundled with this package in the file LICENSE, and is |
| available at through the world-wide-web at |
| http://www.php.net/license/3_0.txt. |
| If you did not receive a copy of the PHP license and are unable to |
| obtain it through the world-wide-web, please send a note to |
| license@php.net so we can mail you a copy immediately. |
+----------------------------------------------------------------------+
| Authors: Nuno Lopes <nlopess@php.net> |
| Mehdi Achour <didou@php.net> |
| Sean Coates <sean@php.net> |
+----------------------------------------------------------------------+
$Id$
*/
include '../include/init.inc.php'; require_once '../include/init.inc.php';
require_once '../include/lib_url_entities.inc.php';
if(is_file('checkent_' . SITE . '.php')) { $dbFile = SQLITE_DIR . "checkent_" . SITE . ".sqlite";
include 'checkent_' . SITE . '.php'; if (!($sqlite = @sqlite_open($dbFile))) {
} else {
echo site_header('docweb.common.header.checkent'); echo site_header('docweb.common.header.checkent');
echo '<p>checkent not found!</p>'; echo '<p>checkent not found!</p>'; // @@@ template this
echo site_footer(); echo site_footer();
exit();
} }
$sql = "
SELECT
start_time, end_time, schemes
FROM
meta_info
";
list($startTime, $endTime, $schemes) = sqlite_fetch_array(sqlite_query($sqlite, $sql));
$entData = array();
$sql = "
SELECT
url_num, entity, url, check_result, return_val
FROM
checked_urls
WHERE
check_result > 0
ORDER BY
check_result, entity
";
$urlsQ = sqlite_query($sqlite, $sql);
while ($row = sqlite_fetch_array($urlsQ)) {
$entData[$row['check_result']][] = $row;
}
echo site_header('docweb.common.header.checkent');
echo DocWeb_Template::get(
'checkent.tpl.php',
array(
'startTime' => $startTime,
'isComplete' => $endTime ? TRUE : FALSE,
'schemes' => $schemes,
'entData' => $entData,
'resultLkp' => $urlResultLookup,
'extraCol' => $urlResultExtraCol,
)
);
echo site_footer();
?> ?>