When I open crawler.php and there are no websites left to be crawled, it empties the `words` table.
Crawler.php:
[php]<?php
// Crawler entry point.
//
// Seeds the crawl queue with the configured entry URL, then repeatedly takes
// the next queued URL, fetches it, indexes its content and queues the links
// found on the page. Progress is printed as one log line per URL.
//
// The $CRAWL_* settings and all helper functions used below are expected to
// come from the three included files.
//
// NOTE(review): the reported symptom (the words table being emptied when the
// queue is empty) is not caused by any statement visible in this script; the
// helpers that run unconditionally (addHeadLink(), markOldURLsToCrawl()) and
// prepareHash() are defined elsewhere and should be checked -- TODO confirm.
require_once('_config.php');
require_once('_db.php');
require_once('_crawler.php');

set_time_limit(0); // lift the execution time limit for the whole run
error_reporting(E_ERROR | E_WARNING | E_PARSE);

$crawl_max_shown_depth = $CRAWL_MAX_DEPTH - 1;

print "Crawler Status: Running\n\n";
print "Log format: Crawling: [Current depth ({$crawl_max_shown_depth} MAX)] URL Action\n\n";

// Optionally switch off index maintenance on the links table while crawling;
// it is switched back on after the loop.
if ($CRAWL_DB_DISABLE_KEYS) {
    sql_query("/*!40000 ALTER TABLE phpcrawler_links\nDISABLE KEYS */;");
}

addHeadLink(1, $CRAWL_ENTRY_POINT_URL);
markOldURLsToCrawl();

$url_counter = 0; // number of URLs taken from the queue
$url_size = 0;    // total bytes fetched

while ($URL_info = getURLToCrawl()) {
    // Cooldown
    usleep($CRAWL_THREAD_SLEEP_TIME);
    $url_counter++;

    $URL = $URL_info['url'];
    $site_URL = $CRAWL_ENTRY_POINT_URL;
    //$site_URL = $URL_info['site_url'];

    // Strip the last path segment to get the directory of the current URL.
    // The character before the removed slash must not itself be a slash, so
    // the "//" after the scheme is never cut. "#" is used as the delimiter
    // because the pattern contains literal slashes, and the replacement is
    // single-quoted so "$1" stays a backreference.
    //$current_URL = preg_replace('#/[^/]+$#i', '', $URL_info['url']);
    $current_URL = preg_replace('#([^/])/[^/]+$#i', '$1', $URL_info['url']);
    //print(" base: " . $current_URL . " ");

    print "Crawling: [" . $URL_info['depth'] . "] {$URL}";

    $page = fetchURL($URL);
    if ($page === false) {
        dropURLFromDB($URL_info['id']);
        print " - FAILED/REMOVED.\n\n";
        continue;
    }

    $page_size = strlen($page);
    $url_size += $page_size;
    print " " . ($page_size / 1000) . "kb";

    $page_content = preparePage($page);
    $page_content_md5 = md5($page_content);
    // NOTE(review): prepareHash() runs before the duplicate check below, so
    // per the original comment it writes words even for pages that are then
    // skipped. Order is kept as-is; confirm whether that is intended.
    $page_hash = prepareHash($page_content); // puts words into DB; returns number of words
    $page_hash_md5 = md5($page_hash);

    // A truthy result from checkEquals() means the page is skipped as a
    // duplicate; the value is reported in the log line.
    if ($page_counter = checkEquals($page_content_md5)) {
        unsetURLFromDB($URL_info['id']);
        print " - SKIPPED ({$page_counter} equals).\n\n";
        continue;
    }

    $URLs_draft = getURLsFromPage($page, $URL_info['depth'] + 1); //array
    $page_title = getPageTitle($page);
    $URLs_clean = filterURLs($URLs_draft, $site_URL, $current_URL); //$base_URL, $current_URL
    $URLs_to_crawl = addURLsToCrawl($URL_info['site_id'], $URLs_clean, $URL_info['depth'] + 1);
    print " +" . $URLs_to_crawl . " urls.\n\n";

    // send_page_to_db($URL_info['id'], $page_title, $page_content, $page_content_md5);
    sendPageToDB($URL_info['id'], $page_title, $page_hash, $page_hash_md5);
}

if ($CRAWL_DB_DISABLE_KEYS) {
    sql_query("/*!40000 ALTER TABLE phpcrawler_links\nENABLE KEYS */;");
}

print $url_counter . " Pages Crawled, " . ($url_size / 1000) . "KB Processed.\n\n";
?>[/php]