Web Crawler Problem

Hi everyone!

I have a serious problem with my website crawler/downloader.

First, my crawler is here: http://www.websiteharvester.com/

When I enter a URL in the text box, it does not get all the content, specifically images. For example, if I enter “http://test.com” it retrieves only 8 images from the URL and not the other 60, but it does get all the web pages, JS and CSS files.

When I enter “http://test.com/images/” it grabs all of the images.
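If it helps, here is a rough sanity check I can run to count what the crawler actually reports per content type (just a throwaway sketch reusing the same PHPCrawler calls as in harvest.php below; CountingCrawler and the counting logic are not part of my real code):

[php]<?php
// Throwaway check: tally what PHPCrawler reports per content type.
include("libs/PHPCrawler.class.php");

class CountingCrawler extends PHPCrawler
{
    public $counts = array();

    function handleDocumentInfo($DocInfo)
    {
        $type = ($DocInfo->content_type == '') ? 'unknown' : $DocInfo->content_type;
        if (!isset($this->counts[$type])) {
            $this->counts[$type] = 0;
        }
        $this->counts[$type]++;
    }
}

$crawler = new CountingCrawler();
$crawler->setURL("http://test.com");
$crawler->setFollowMode(3);
$crawler->enableAggressiveLinkSearch(true);
$crawler->go();

print_r($crawler->counts);
?>[/php]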

I'm bad at explaining PHP stuff, so please forgive me if I didn't give enough info. I also tried posting on Stack Overflow, but they were quite rude.

If it's any help, this is the code from download.php:

[php]<?php
session_start();
$action = $_POST['action'];

switch($action){
case 'cleanup' :
    $dir = getcwd()."/requests/";
    foreach(glob($dir . '/*') as $file) {
        if(is_dir($file)) {
system("rm -rf ".escapeshellarg($dir));
        } else if (time()-filemtime($file) > 1800) {
            unlink($file);
        }
    }
    break;
case 'downloadrows' :
    $rows = $_POST['rowsToDownload'];
    $curpath = getcwd()."/";
    $mainFolderPath = $curpath."requests/".uniqid('scriptharvest');
    $followDirStructure = $_POST['followdirstructure'];
    $returnData = array();
    $errors = array();

    foreach ($rows as &$dfile) {
        $fileDestPath = $mainFolderPath."/";
        if($followDirStructure == "true") {
            $fileDestPath .= $dfile["filepath"]."/";
        }

        if(!is_dir($fileDestPath)){
            if(!mkdir($fileDestPath, 0777, true)) {
                array_push($errors, "Failed to create path ".$fileDestPath);
                break;
            }
        }

        $content = file_get_contents($dfile["url"], false, stream_context_create(array( 'http' => array('ignore_errors' => true))));
        if ($content !== false && $content != "") {
            if(!file_put_contents($fileDestPath.$dfile["saveas"], $content)) {
                array_push($errors, "Failed to save file from this URL : ".$dfile["url"]);
            }
        } else {
            array_push($errors, "This URL does not exists, got a 404 error : ".$dfile["url"]);
        }
    }

    $zip = $mainFolderPath.".zip";
    zipFolder($mainFolderPath."/", $zip);
    rrmdir($mainFolderPath."/");
    $returnData["zipfull"] = $zip;
    $returnData["zip"] = str_replace($curpath, "", $zip);
    $returnData["destinationfull"] = $mainFolderPath."/";
    $returnData["destination"] = str_replace($curpath, "", $mainFolderPath)."/";
    $returnData["errors"] = $errors;
    $returnData["requestcount"] = sizeof($rows);

    if(sizeof($rows) ==1 && $followDirStructure != "true") {
        $returnData["singlefiledownload"] = $returnData["destination"].$rows[0]["filename"];
    }

    echo json_encode($returnData);
    break;

}

function ZipFolder($source, $destination)
{
if (!extension_loaded('zip') || !file_exists($source)) {
    return false;
}

$zip = new ZipArchive();
if (!$zip->open($destination, ZIPARCHIVE::CREATE)) {
    return false;
}

$source = str_replace('\\', '/', realpath($source));

if (is_dir($source) === true)
{
    $files = new RecursiveIteratorIterator(new RecursiveDirectoryIterator($source), RecursiveIteratorIterator::SELF_FIRST);

    foreach ($files as $file)
    {
        $file = str_replace('\\', '/', $file);

        // Ignore "." and ".." folders
        if( in_array(substr($file, strrpos($file, '/')+1), array('.', '..')) )
            continue;

        $file = realpath($file);

        if (is_dir($file) === true)
        {
            $zip->addEmptyDir(str_replace($source . '/', '', $file . '/'));
        }
        else if (is_file($file) === true)
        {
            $zip->addFromString(str_replace($source . '/', '', $file), file_get_contents($file));
        }
    }
}
else if (is_file($source) === true)
{
    $zip->addFromString(basename($source), file_get_contents($source));
}

return $zip->close();

}

function rrmdir($dir) {
system("rm -rf ".escapeshellarg($dir));
}

?>[/php]

And this is the code from harvest.php:

[php]<?php
// It may take a while to crawl a site ...
set_time_limit(10000);
session_start(); // needed so $_SESSION['links'] is available below

// Include the phpcrawl main class
include("libs/PHPCrawler.class.php");

// variables
$theurl = $_GET["theurl"];
$follow_mode = 3;
$_SESSION['links'] = array();
$requestid = str_replace(".", "", uniqid("", true));

// Extend the class and override the handleDocumentInfo()-method
class MyCrawler extends PHPCrawler
{
function handleDocumentInfo($DocInfo)
{
switch($DocInfo->http_status_code){
    case 200 : $http_status = "<span class=\"label label-success\">200</span>"; break;
    case 401 : $http_status = "<span class=\"label label-danger\">401</span>"; break;
    case 404 : $http_status = "<span class=\"label label-danger\">404</span>"; break;
    case 301 : $http_status = "<span class=\"label label-warning\">301</span>"; break;
    default: $http_status = "<span class=\"label label-info\">".($DocInfo->http_status_code == '' ? "??" : $DocInfo->http_status_code)."</span>"; break;
}

$content_type = $DocInfo->content_type;
if ($content_type == '') {
    $content_type = "text/html";
}

echo "<tr>".
        "<td align=\"center\">"."<input type=\"checkbox\" name=\"$DocInfo->file\" value=\"$DocInfo->path\" ".($DocInfo->http_status_code==200 ? "" : "disabled")."></td>".
		"<td>$http_status</td>".
		"<td>$content_type</td>".
		"<td><a href=\"$DocInfo->url\">$DocInfo->url</a></td>".
	"</tr>";
	
	if (isset($_SESSION['links']))
	   $links = $_SESSION['links'];
	else
	   $links = array();
	array_push($links, $DocInfo);
	$_SESSION['links'] = $links;
flush(); 

}
}

// Now, create a instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.

$crawler = new MyCrawler();

// URL to crawl
$crawler->setURL($theurl);

//setting the follow mode
$crawler->setFollowMode($follow_mode);
$crawler->enableAggressiveLinkSearch(true);

$crawler->addContentTypeReceiveRule("#text/js#");
$crawler->addContentTypeReceiveRule("#text/html#");
$crawler->addContentTypeReceiveRule("#text/css#");

// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);

// Set the traffic-limit to about 5 MB (in bytes;
// for testing we don't want to "suck" the whole site)
$crawler->setTrafficLimit(5000 * 1024);
$crawler->setPageLimit(5000, false);

?>

<div class="btn-toolbar pull-left">
  	<label class="checkbox-inline">
	  <input type="checkbox" class="choiceCheckbox" value="html" checked> HTML
	</label>
	<label class="checkbox-inline">
	  <input type="checkbox" class="choiceCheckbox" value="image" checked> Images
	</label>
	<label class="checkbox-inline">
	  <input type="checkbox" class="choiceCheckbox" value="css" checked> CSS
	</label>
	<label class="checkbox-inline">
	  <input type="checkbox" class="choiceCheckbox" value="javascript" checked> JS
	</label>
</div>

<input type="hidden" id="uniqueid" value="<?php echo $requestid; ?>">

 

 

<?php try { $crawler->goMultiProcessed(8); } catch (Exception $e) { $crawler->go(); } ?>
<tr><th>Select</th><th>Status</th><th>Type</th><th>URL</th></tr>
[/php]

And finally, the code from harvester.php:

[php]<?php

$theurl = $_GET['url'];
$htmlExts = array("htm", "html", "php", "asp", "aspx", "cfm");
$jsExts = array("js", "json");
$cssExts = array("css");
$imageExts = array("png", "jpeg", "bmp", "gif", "psd", "ico", "jpg");
ini_set('max_execution_time', 60);

//harvestSite($theurl);
echo json_encode(harvestSite($theurl));

function harvestSite($url) {
$mainLinks = harvestLink($url);
$maxlinks = 10;
//scheme, host, url, filename, dirname, basename, extension, path

$morelinks = array_filter($mainLinks, "cssLinks");
$morelinks = array_slice($morelinks, 0, $maxlinks);
$m = sizeof($morelinks);
$maxlinks -= $m;
for($i=0; $i < $m; $i++) {
	$found = harvestLink($morelinks[$i]['url']);
	$mainLinks = array_unique(array_merge($mainLinks, $found), SORT_REGULAR);
}
return $mainLinks;

}

function cssLinks ($link) {
global $cssExts;
return in_array($link['extension'], $cssExts) && $link['isinternal'] == true;
}

function htmlLinks ($link) {
global $htmlExts;
return in_array($link['extension'], $htmlExts) && $link['isinternal'] == true;
}

function harvestLink ($url) {
if(validateURL($url) == false) {
return null;
}
global $htmlExts, $cssExts, $jsExts, $imageExts;
$urlDet = getURLStructure($url, $url);
$urlExt = $urlDet['extension'] != null ? $urlDet['extension'] : 'html';
$urlExt = strtolower($urlExt);
$links = array();
array_push($links, getURLStructure($url, $url));

if (in_array($urlExt, $imageExts)) {
	return $links;
} 

if (in_array($urlExt, $jsExts)) {
	$jslinks = harvestFromJSContent($url);
	if ($jslinks != null) {
		$links = array_unique(array_merge($links, $jslinks), SORT_REGULAR);
	}
} else if (in_array($urlExt, $cssExts)) {
	$csslinks = harvestFromCSSContent($url);
	if ($csslinks != null) {
		$links = array_unique(array_merge($links, $csslinks), SORT_REGULAR);
	}
} else if (in_array($urlExt, $htmlExts)) {
	$htmllinks = harvestFromHTMLContent($url);
	if ($htmllinks != null) {
		$links = array_unique(array_merge($links, $htmllinks), SORT_REGULAR);
	}
}
return $links;

}

function harvestFromHTMLContent ($url, $parenturl="") {
$text = file_get_contents($url, false, stream_context_create(array( 'http' => array('ignore_errors' => true))));
if($text === false || $text == "") {
return null;
}
$dom = new DOMDocument();
@$dom->loadHTML($text);
$xpath = new DOMXPath($dom);
$linksArray = array();
$nodesSelector = "(//@href | //@src)[not(.=preceding::*//@src)][not(.=preceding::*//@href)]";
if($parenturl != "") {
    $nodesSelector .= '[starts-with(self::node(), "'.$parenturl.'")]';
}

$Links = $xpath->evaluate($nodesSelector);
for ($i = 0; $i < $Links->length; $i++) {
       $Link = $Links->item($i)->value;
       $Link = getURLStructure($Link, $url);
       array_push($linksArray, $Link);
}
return $linksArray;

}

function harvestFromCSSContent($url) {
$text = file_get_contents($url, false, stream_context_create(array( 'http' => array('ignore_errors' => true))));
if($text === false || $text == "") {
return null;
}
$urls = array( );

$url_pattern     = '(([^\\\\\'", \(\)]*(\\\\.)?)+)';
$urlfunc_pattern = 'url\(\s*[\'"]?' . $url_pattern . '[\'"]?\s*\)';
$pattern         = '/(' .
     '(@import\s*[\'"]' . $url_pattern     . '[\'"])' .
    '|(@import\s*'      . $urlfunc_pattern . ')'      .
    '|('                . $urlfunc_pattern . ')'      .  ')/iu';
if ( !preg_match_all( $pattern, $text, $matches ) )
    return $urls;

// @import '...'
// @import "..."
foreach ( $matches[3] as $match ) {
    if ( !empty($match) ) 
        array_push($urls, getURLStructure(preg_replace( '/\\\\(.)/u', '\\1', $match ), $url));
}


foreach ( $matches[7] as $match ) {
    if ( !empty($match) )
        array_push($urls, getURLStructure(preg_replace( '/\\\\(.)/u', '\\1', $match ), $url));  
}

// url(...)
// url('...')
// url("...")
foreach ( $matches[11] as $match ) {
    if ( !empty($match) )
    	array_push($urls, getURLStructure(preg_replace( '/\\\\(.)/u', '\\1', $match ), $url));
}

return $urls;

}

function getURLStructure($url, $parenturl) {
$ret = parse_url($url);
$parentParse = parse_url($parenturl);
if (isset($parentParse['path'])) {
$parentParse = array_merge($parentParse, pathinfo($parentParse['path']));
}

$t = $url;
global $htmlExts;

if(!isset($ret['host']) && $parenturl != "") {
	$t = relativeToAbsoluteURL($parenturl, $url);
	$ret = parse_url($t);
}
$ret['url'] = $t;
$ret['isinternal'] = true;
if (isset($ret['host'])) {
	$ret['isinternal'] = (ltrim($ret['host'],"www.") == ltrim($parentParse['host'],"www.") ? true : false);
}
if (!isset($ret['extension'])) {
	$ret['extension'] = "html";
}
if(isset($ret['path'])) {
	$ret = array_merge($ret, pathinfo($ret['path']));
}
return $ret;

}

function validateURL ($url) {
return true;
}

//copied from http://toolspot.org/relative-path-into-absolute-url.php
function relativeToAbsoluteURL($base, $rel)
{
    if(strpos($rel, "//") === 0)
    {
        return "http:".$rel;
    }
    /* return if already absolute URL */
    if (parse_url($rel, PHP_URL_SCHEME) != '') return $rel;
    /* queries and anchors */
    if ($rel[0] == '#' || $rel[0] == '?') return $base.$rel;
    /* parse base URL and convert to local variables:
       $scheme, $host, $path */
    extract(parse_url($base));
    /* remove non-directory element from path */
    $path = preg_replace('#/[^/]*$#', '', $path);
    /* destroy path if relative url points to root */
    if ($rel[0] == '/') $path = '';
    /* dirty absolute URL */
    $abs = "$host$path/$rel";
    /* replace '//' or '/./' or '/foo/../' with '/' */
    $re = array('#(/\.?/)#', '#/(?!\.\.)[^/]+/\.\./#');
    for($n=1; $n>0; $abs=preg_replace($re, '/', $abs, -1, $n)) {}
    /* absolute URL is ready! */
    return $scheme.'://'.$abs;
}

?>[/php]

If someone could help me fix this I would be very grateful! ;D

Don't forget to ask for any other info you might need, as I have not provided much at all :confused:

Did they give you any clues tho?

You may be running into permissions issues, script timeouts, or oddly-named files that throw errors in your script.
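For example, something roughly like this would rule out permissions and odd filenames before you even run the full crawl (just a sketch: the 300-second limit, the sanitizing regex, and the sample $dfile/$content values are my own placeholders, standing in for the ones in your download.php loop):

[php]<?php
// Sketch: rule out permissions / odd filenames before saving files.
set_time_limit(300); // generous limit in case downloads are slow

$dir = getcwd()."/requests/";
if (!is_writable($dir)) {
    die("requests/ is not writable by the web server user");
}

// Placeholders for one entry of your $rows loop.
$dfile = array("saveas" => "example image (1).png");
$content = "dummy file contents";

// Strip path parts and anything suspicious from the client-supplied name.
$saveas = basename($dfile["saveas"]);
$saveas = preg_replace('/[^A-Za-z0-9._-]/', '_', $saveas);

if (file_put_contents($dir.$saveas, $content) === false) {
    error_log("Failed to write ".$dir.$saveas);
}
?>[/php]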

You can start by adding try/catch error handling around each function and trying to find out where it is stopping, unless you know that already.
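Something like this makes those failures visible, since file_get_contents() only raises warnings on its own (again just a sketch; the example URL is made up, and in your script the loop would run over $rows):

[php]<?php
// Sketch: convert PHP warnings into exceptions so try/catch can catch
// failures from file_get_contents() / file_put_contents().
set_error_handler(function ($severity, $message, $file, $line) {
    throw new ErrorException($message, 0, $severity, $file, $line);
});

$urls = array("http://test.com/images/logo.png"); // stand-in for your $rows loop

foreach ($urls as $url) {
    try {
        $content = file_get_contents($url);
        // ... file_put_contents(...), etc. ...
    } catch (ErrorException $e) {
        error_log("Download failed for $url: ".$e->getMessage());
    }
}

restore_error_handler();
?>[/php]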
