Hi everyone!
I have a serious problem with my website crawler/downloader.
First, my crawler is here: http://www.websiteharvester.com/
When I enter a URL in the text box, it does not get all of the content, specifically images. For example, if I enter "http://test.com" it retrieves only 8 images from the site and misses the other 60, even though it does get all of the web pages, JS and CSS files.
When I enter "http://test.com/images/" it grabs all of the images.
I'm not great at explaining PHP stuff, so please forgive me if I haven't given enough info. I also tried posting on Stack Overflow, but the responses there were quite rude.
If it's any help, this is the code from download.php:
[php]<?php
session_start();
$action = $_POST['action'];
switch($action){
case 'cleanup' :
$dir = getcwd()."/requests/";
foreach(glob($dir . '/*') as $file) {
if(is_dir($file)) {
// remove the expired request folder itself, not the whole requests directory
system("rm -rf ".escapeshellarg($file));
} else if (time()-filemtime($file) > 1800) {
unlink($file);
}
}
break;
case 'downloadrows' :
$rows = $_POST['rowsToDownload'];
$curpath = getcwd()."/";
$mainFolderPath = $curpath."requests/".uniqid('scriptharvest');
$followDirStructure = $_POST['followdirstructure'];
$returnData = array();
$errors = array();
foreach ($rows as &$dfile) {
$fileDestPath = $mainFolderPath."/";
if($followDirStructure == "true") {
$fileDestPath .= $dfile["filepath"]."/";
}
if(!is_dir($fileDestPath)){
if(!mkdir($fileDestPath , 0777, true))
{
array_push($errors, "Failed to create path ".$fileDestPath);
break;
}
}
$content = file_get_contents($dfile["url"], false, stream_context_create(array( 'http' => array('ignore_errors' => true))));
if (isset($content) && $content != null) {
if(!file_put_contents($fileDestPath.$dfile["saveas"], $content)) {
array_push($errors, "Failed to save file from this URL : ".$dfile["url"]);
}
} else {
array_push($errors, "This URL does not exists, got a 404 error : ".$dfile["url"]);
}
}
$zip = $mainFolderPath.".zip";
zipFolder($mainFolderPath."/", $zip);
rrmdir($mainFolderPath."/");
$returnData["zipfull"] = $zip;
$returnData["zip"] = str_replace($curpath, "", $zip);
$returnData["destinationfull"] = $mainFolderPath."/";
$returnData["destination"] = str_replace($curpath, "", $mainFolderPath)."/";
$returnData["errors"] = $errors;
$returnData["requestcount"] = sizeof($rows);
if(sizeof($rows) ==1 && $followDirStructure != "true") {
$returnData["singlefiledownload"] = $returnData["destination"].$rows[0]["filename"];
}
echo json_encode($returnData);
break;
}
function ZipFolder($source, $destination)
{
if (!extension_loaded('zip') || !file_exists($source)) {
return false;
}
$zip = new ZipArchive();
if (!$zip->open($destination, ZIPARCHIVE::CREATE)) {
return false;
}
$source = str_replace('\\', '/', realpath($source));
if (is_dir($source) === true)
{
$files = new RecursiveIteratorIterator(new RecursiveDirectoryIterator($source), RecursiveIteratorIterator::SELF_FIRST);
foreach ($files as $file)
{
$file = str_replace('\\', '/', $file);
// Ignore "." and ".." folders
if( in_array(substr($file, strrpos($file, '/')+1), array('.', '..')) )
continue;
$file = realpath($file);
if (is_dir($file) === true)
{
$zip->addEmptyDir(str_replace($source . '/', '', $file . '/'));
}
else if (is_file($file) === true)
{
$zip->addFromString(str_replace($source . '/', '', $file), file_get_contents($file));
}
}
}
else if (is_file($source) === true)
{
$zip->addFromString(basename($source), file_get_contents($source));
}
return $zip->close();
}
function rrmdir($dir) {
system("rm -rf ".escapeshellarg($dir));
}
?>[/php]
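For reference, the page's JavaScript posts the selected rows to download.php roughly like this (the field names are taken from the code above, the URL and values are just made-up examples):
[php]<?php
// example of what download.php expects in $_POST (made-up values; the real ones come from the crawl results)
$examplePost = array(
    'action'             => 'downloadrows',
    'followdirstructure' => 'true',
    'rowsToDownload'     => array(
        array(
            'url'      => 'http://test.com/images/logo.png', // file to fetch
            'filepath' => 'images',                          // subfolder used when followdirstructure is "true"
            'saveas'   => 'logo.png',                        // file name written inside the zip
            'filename' => 'logo.png',                        // used for the single-file download shortcut
        ),
    ),
);
?>[/php]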
And this is the code from harvest.php:
[php]<?php
// It may take a while to crawl a site ...
set_time_limit(10000);
// $_SESSION['links'] is used below (skip this line if the session is already started elsewhere)
session_start();
// Include the phpcrawl-mainclass
include("libs/PHPCrawler.class.php");
//variables
$theurl = $_GET["theurl"];
$follow_mode = 3;
$_SESSION['links'] = array();
$requestid = str_replace(".", "", uniqid("", true));
// Extend the class and override the handleDocumentInfo()-method
class MyCrawler extends PHPCrawler
{
function handleDocumentInfo($DocInfo)
{
switch($DocInfo->http_status_code){
case 200 : $http_status = "<span class=\"label label-success\">200</span>"; break;
case 401 : $http_status = "<span class=\"label label-danger\">401</span>"; break;
case 404 : $http_status = "<span class=\"label label-danger\">404</span>"; break;
case 301 : $http_status = "<span class=\"label label-warning\">301</span>"; break;
default: $http_status = "<span class=\"label label-info\">".($DocInfo->http_status_code == '' ? "??" : $DocInfo->http_status_code)."</span>"; break;
}
$content_type = $DocInfo->content_type;
if ($content_type == '') {
$content_type = "text/html";
}
echo "<tr>".
"<td align=\"center\">"."<input type=\"checkbox\" name=\"$DocInfo->file\" value=\"$DocInfo->path\" ".($DocInfo->http_status_code==200 ? "" : "disabled")."></td>".
"<td>$http_status</td>".
"<td>$content_type</td>".
"<td><a href=\"$DocInfo->url\">$DocInfo->url</a></td>".
"</tr>";
if (isset($_SESSION['links']))
$links = $_SESSION['links'];
else
$links = array();
array_push($links, $DocInfo);
$_SESSION['links'] = $links;
flush();
}
}
// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.
$crawler = new MyCrawler();
// URL to crawl
$crawler->setURL($theurl);
//setting the follow mode
$crawler->setFollowMode($follow_mode);
$crawler->enableAggressiveLinkSearch(true);
$crawler->addContentTypeReceiveRule("#text/js#");
$crawler->addContentTypeReceiveRule("#text/html#");
$crawler->addContentTypeReceiveRule("#text/css#");
// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);
// Set the traffic-limit to roughly 5 MB (in bytes,
// for testing we don't want to "suck" the whole site)
$crawler->setTrafficLimit(5000 * 1024);
$crawler->setPageLimit(5000, false);
?>
<div class="btn-toolbar pull-left">
<label class="checkbox-inline">
<input type="checkbox" class="choiceCheckbox" value="html" checked> HTML
</label>
<label class="checkbox-inline">
<input type="checkbox" class="choiceCheckbox" value="image" checked> Images
</label>
<label class="checkbox-inline">
<input type="checkbox" class="choiceCheckbox" value="css" checked> CSS
</label>
<label class="checkbox-inline">
<input type="checkbox" class="choiceCheckbox" value="javascript" checked> JS
</label>
</div>
<input type="hidden" id="uniqueid" value="<?php echo $requestid; ?>">
<table>
<tr><th>Select</th><th>Status</th><th>Type</th><th>URL</th></tr>
[/php]
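For context, harvest.php just takes the start URL from the query string (the $_GET["theurl"] line above), so a request to it looks roughly like this. This is an illustration only, and I'm assuming harvest.php sits in the web root:
[php]<?php
// illustration only: fetching the crawl page for a given start URL
// (assumes harvest.php is in the web root; the example URL is made up)
$startUrl   = "http://test.com";
$requestUrl = "http://www.websiteharvester.com/harvest.php?theurl=" . urlencode($startUrl);
echo file_get_contents($requestUrl);
?>[/php]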
And finally, the code from harvester.php:
[php]<?php
$theurl = $_GET['url'];
$htmlExts = array("htm", "html", "php", "asp", "aspx", "cfm");
$jsExts = array("js", "json");
$cssExts = array("css");
$imageExts = array("png", "jpeg", "bmp", "gif", "psd", "ico", "jpg");
ini_set('max_execution_time', 60);
//harvestSite($theurl);
echo json_encode(harvestSite($theurl));
function harvestSite($url) {
$mainLinks = harvestLink($url);
$maxlinks = 10;
//scheme, host, url, filename, dirname, basename, extension, path
$morelinks = array_filter($mainLinks, "cssLinks");
$morelinks = array_slice($morelinks, 0, $maxlinks);
$m = sizeof($morelinks);
$maxlinks -= $m;
for($i=0; $i < $m; $i++) {
$found = harvestLink($morelinks[$i]['url']);
$mainLinks = array_unique(array_merge($mainLinks, $found), SORT_REGULAR);
}
return $mainLinks;
}
function cssLinks ($link) {
global $cssExts;
return in_array($link['extension'], $cssExts) && $link['isinternal'] == true;
}
function htmlLinks ($link) {
global $htmlExts;
return in_array($link['extension'], $htmlExts) && $link['isinternal'] == true;
}
function harvestLink ($url) {
if(validateURL($url) == false) {
return null;
}
global $htmlExts, $cssExts, $jsExts, $imageExts;
$urlDet = getURLStructure($url, $url);
$urlExt = $urlDet['extension'] != null ? $urlDet['extension'] : 'html';
$urlExt = strtolower($urlExt);
$links = array();
array_push($links, getURLStructure($url, $url));
if (in_array($urlExt, $imageExts)) {
return $links;
}
if (in_array($urlExt, $jsExts)) {
$jslinks = harvestFromJSContent($url);
if ($jslinks != null) {
$links = array_unique(array_merge($links, $jslinks), SORT_REGULAR);
}
} else if (in_array($urlExt, $cssExts)) {
$csslinks = harvestFromCSSContent($url);
if ($csslinks != null) {
$links = array_unique(array_merge($links, $csslinks), SORT_REGULAR);
}
} else if (in_array($urlExt, $htmlExts)) {
$htmllinks = harvestFromHTMLContent($url);
if ($htmllinks != null) {
$links = array_unique(array_merge($links, $htmllinks), SORT_REGULAR);
}
}
return $links;
}
function harvestFromHTMLContent ($url, $parenturl="") {
$text = file_get_contents($url, false, stream_context_create(array( 'http' => array('ignore_errors' => true))));
if(!isset($text) || $text == "") {
return null;
}
$dom = new DOMDocument();
@$dom->loadHTML($text);
$xpath = new DOMXPath($dom);
$linksArray = array();
$nodesSelector = "(//@href | //@src)[not(.=preceding::*/@src)][not(.=preceding::*/@href)]";
if($parenturl != "") {
$nodesSelector .= '[starts-with(self::node(), "'.$parenturl.'")]';
}
$Links = $xpath->evaluate($nodesSelector);
for ($i = 0; $i < $Links->length; $i++) {
$Link = $Links->item($i)->value;
$Link = getURLStructure($Link, $url);
array_push($linksArray, $Link);
}
return $linksArray;
}
function harvestFromCSSContent($url) {
$text = file_get_contents($url, false, stream_context_create(array( 'http' => array('ignore_errors' => true))));
if(!isset($text) || $text == "") {
return null;
}
$urls = array( );
$url_pattern = '(([^\\\\\'", \(\)]*(\\\\.)?)+)';
$urlfunc_pattern = 'url\(\s*[\'"]?' . $url_pattern . '[\'"]?\s*\)';
$pattern = '/(' .
'(@import\s*[\'"]' . $url_pattern . '[\'"])' .
'|(@import\s*' . $urlfunc_pattern . ')' .
'|(' . $urlfunc_pattern . ')' . ')/iu';
if ( !preg_match_all( $pattern, $text, $matches ) )
return $urls;
// @import '...'
// @import "..."
foreach ( $matches[3] as $match ) {
if ( !empty($match) )
array_push($urls, getURLStructure(preg_replace( '/\\\\(.)/u', '\\1', $match ), $url));
}
foreach ( $matches[7] as $match ) {
if ( !empty($match) )
array_push($urls, getURLStructure(preg_replace( '/\\\\(.)/u', '\\1', $match ), $url));
}
// url(...)
// url('...')
// url("...")
foreach ( $matches[11] as $match ) {
if ( !empty($match) )
array_push($urls, getURLStructure(preg_replace( '/\\\\(.)/u', '\\1', $match ), $url));
}
return $urls;
}
function getURLStructure($url, $parenturl) {
$ret = parse_url($url);
$parentParse = parse_url($parenturl);
if (isset($parentParse['path'])) {
$parentParse = array_merge($parentParse, pathinfo($parentParse['path']));
}
$t = $url;
global $htmlExts;
if(!isset($ret['host']) && $parenturl != "") {
$t = relativeToAbsoluteURL($parenturl, $url);
$ret = parse_url($t);
}
$ret['url'] = $t;
$ret['isinternal'] = true;
if (isset($ret['host'])) {
// strip only a leading "www." (ltrim with a character list would also strip other leading w's)
$ret['isinternal'] = (preg_replace('/^www\./', '', $ret['host']) == preg_replace('/^www\./', '', $parentParse['host']));
}
if (!isset($ret['extension'])) {
$ret['extension'] = "html";
}
if(isset($ret['path'])) {
$ret = array_merge($ret, pathinfo($ret['path']));
}
return $ret;
}
function validateURL ($url) {
return true;
}
//copied from http://toolspot.org/relative-path-into-absolute-url.php
function relativeToAbsoluteURL($base, $rel)
{
if(strpos($rel,"//")===0)
{
return "http:".$rel;
}
/* return if already absolute URL */
if (parse_url($rel, PHP_URL_SCHEME) != '') return $rel;
/* queries and anchors */
if ($rel[0]=='#' || $rel[0]=='?') return $base.$rel;
/* parse base URL and convert to local variables:
$scheme, $host, $path */
extract(parse_url($base));
/* remove non-directory element from path */
$path = preg_replace('#/[^/]*$#', '', $path);
/* destroy path if relative url points to root */
if ($rel[0] == '/') $path = '';
/* dirty absolute URL */
$abs = "$host$path/$rel";
/* replace '//' or '/./' or '/foo/../' with '/' */
$re = array('#(/\.?/)#', '#/(?!\.\.)[^/]+/\.\./#');
for($n=1; $n>0; $abs=preg_replace($re, '/', $abs, -1, $n)) {}
/* absolute URL is ready! */
return $scheme.'://'.$abs;
?>[/php]
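And in case it helps, this is roughly the array that getURLStructure() builds for a relative link found on a page. The field names come from the code above; the values are just me tracing through the code by hand, so they may be slightly off:
[php]<?php
// roughly what getURLStructure("css/site.css", "http://test.com/index.html") returns
// (illustrative only, traced by hand from the code above)
$example = array(
    'scheme'     => 'http',
    'host'       => 'test.com',
    'path'       => '/css/site.css',
    'url'        => 'http://test.com/css/site.css', // absolute URL built by relativeToAbsoluteURL()
    'isinternal' => true,                           // same host as the parent page
    'extension'  => 'css',                          // defaults to "html" when the path has no extension
    'dirname'    => '/css',
    'basename'   => 'site.css',
    'filename'   => 'site',
);
?>[/php]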
If someone could help me fix this I would be very grateful! ;D
Don't hesitate to ask for any other info you might need, as I know I haven't provided much at all.