background image
HomeRecent PostsDrupalSearchTagsRSSContactAboutAccount
Eric.London's picture

Here's a simple PHP class I wrote to crawl a URL and return a list of internal and external URLs. I've used it in the past, for development purposes only, to find 404s and repetition in URL structure. Note that it does not read robots.txt files or obey any similar rules. I just thought I'd pull it out of the archives and share it on the web.

#!/usr/bin/php

<?php
/**
 * Minimal single-site crawler intended for development use.
 *
 * Starting from one http(s) URL, it fetches every page that shares the start
 * URL's base domain, extracts the href of each anchor tag with a regular
 * expression, and records two lists: the internal URLs it visited and the
 * external URLs it saw. It does not read robots.txt or throttle requests.
 *
 * Known limitations:
 * - The "base domain" is the last two labels of the host name, so hosts under
 *   multi-label suffixes (e.g. "example.co.uk") are reduced to "co.uk".
 * - Relative paths ("page.html", "../x") are resolved against the site root,
 *   not against the directory of the page that contained the link.
 */
class Crawl {

  // Regular expression whose first capture group is the href of an <a> tag.
  protected $regex_link;

  // Start URL exactly as passed to the constructor.
  protected $website_url;

  // Scheme, host and optional port of the start URL, e.g. "https://www.example.com".
  protected $website_url_origin;

  // Base domain of the start URL, e.g. "example.com"; defines what is "internal".
  protected $website_url_base;

  // Internal URLs for which a fetch has been attempted.
  protected $urls_processed;

  // URLs found on crawled pages whose base domain differs from the start URL's.
  protected $urls_external;

  // Queue of internal URLs still waiting to be fetched.
  protected $urls_not_processed;

  // Substrings; any URL containing one of them is skipped entirely.
  protected $urls_ignored;

  /**
   * Sets up the crawler and immediately crawls the whole site.
   *
   * If $website_url is not a string starting with "http://" or "https://", or
   * its host cannot be determined, nothing is crawled and both result lists
   * stay empty.
   *
   * @param string $website_url
   *   Absolute http(s) URL to start crawling from.
   */
  public function __construct($website_url = NULL) {

    $this->regex_link = "/<\s*a\s+[^>]*href\s*=\s*[\"']?([^\"' >]+)[\"' >]/isU";
    $this->urls_processed = array();
    $this->urls_external = array();
    $this->urls_not_processed = array();
    $this->urls_ignored = array(
      '/search/apachesolr_search/',
      '/comment/reply/',
    );

    // A constructor cannot hand a value back to "new", so bail out with a
    // bare return and leave the (empty) result lists in place.
    if (!$this->validate_arg_website_url($website_url)) {
      return;
    }

    $this->website_url = $website_url;

    $url_base = $this->get_url_base($this->website_url);
    if (!$url_base) {
      return;
    }
    $this->website_url_base = $url_base;

    // Remember scheme/host/port so relative links are resolved against the
    // site actually being crawled rather than "http://" + base domain.
    $this->website_url_origin = $this->get_url_origin($this->website_url);

    // Seed the queue and keep going until no unvisited internal URL is left.
    $this->urls_not_processed[] = $this->website_url;
    while (count($this->urls_not_processed)) {
      $this->process_urls_not_processed();
    }

    sort($this->urls_processed);
    sort($this->urls_external);
  }

  /**
   * Checks that the start URL is a string beginning with http:// or https://.
   *
   * @param mixed $website_url
   *   Value to validate.
   *
   * @return bool
   *   TRUE when the value is acceptable, FALSE otherwise.
   */
  protected function validate_arg_website_url($website_url = NULL) {
    if (!is_string($website_url)) {
      return FALSE;
    }
    return substr($website_url, 0, 7) == 'http://' || substr($website_url, 0, 8) == 'https://';
  }

  /**
   * Returns the base domain (last two host labels) of a URL.
   *
   * @param string $url
   *   Absolute URL.
   *
   * @return string|false
   *   Lower-cased base domain such as "example.com"; the full host for IP
   *   addresses and single-label hosts such as "localhost"; FALSE when the
   *   URL has no host or cannot be parsed.
   */
  protected function get_url_base($url = NULL) {

    if (!is_string($url) || $url === '') {
      return FALSE;
    }

    $url_parts = parse_url($url);
    if (!is_array($url_parts) || empty($url_parts['host'])) {
      return FALSE;
    }

    // Host names are case-insensitive; normalise so comparisons are reliable.
    $host = strtolower($url_parts['host']);

    // An IP address has no "domain + extension"; cutting it to two labels
    // ("0.1") would produce a different, meaningless host.
    if (filter_var($host, FILTER_VALIDATE_IP) !== FALSE) {
      return $host;
    }

    $labels = explode('.', $host);
    $label_count = count($labels);

    // Single-label hosts (e.g. "localhost") are already their own base.
    if ($label_count < 2) {
      return $host;
    }

    return $labels[$label_count - 2] . '.' . $labels[$label_count - 1];
  }

  /**
   * Returns "scheme://host[:port]" for an absolute URL.
   *
   * @param string $url
   *   Absolute URL.
   *
   * @return string|false
   *   The origin, or FALSE when scheme or host is missing.
   */
  protected function get_url_origin($url) {
    $parts = parse_url($url);
    if (!is_array($parts) || empty($parts['scheme']) || empty($parts['host'])) {
      return FALSE;
    }

    $origin = $parts['scheme'] . '://' . $parts['host'];
    if (isset($parts['port'])) {
      $origin .= ':' . $parts['port'];
    }
    return $origin;
  }

  /**
   * Fetches one URL and extracts the href values of its anchor tags.
   *
   * The URL is added to the processed list before the fetch, so a URL that
   * fails to load is still reported and never retried.
   *
   * @param string $url
   *   Absolute URL to fetch.
   *
   * @return array|false
   *   Unique raw href values found on the page, or FALSE when the URL is
   *   invalid, was already processed, could not be loaded or was empty.
   */
  protected function scan_url($url) {

    if (!is_string($url) || !$url || !strlen($url)) {
      return FALSE;
    }

    if (in_array($url, $this->urls_processed, TRUE)) {
      return FALSE;
    }
    $this->urls_processed[] = $url;

    // file_get_contents() returns FALSE on failure; that is the only signal
    // used here because track_errors/$php_errormsg no longer exist in PHP 8.
    $page_contents = file_get_contents($url);
    if ($page_contents === FALSE || $page_contents === '') {
      return FALSE;
    }

    $matches = array();
    preg_match_all($this->regex_link, $page_contents, $matches);

    if (isset($matches[1])) {
      return array_unique($matches[1]);
    }

    return FALSE;
  }

  /**
   * Normalises raw href values and files them as queued or external URLs.
   *
   * For each href: the fragment is dropped, protocol-relative and
   * root-relative links are made absolute using the start URL's scheme and
   * origin, links using any scheme other than http(s) are skipped, one
   * trailing slash is removed, and ignored patterns are filtered out. URLs on
   * another base domain go to the external list; unseen internal URLs are
   * appended to the queue.
   *
   * @param array $matches
   *   Raw href values as returned by scan_url().
   *
   * @return bool
   *   FALSE when $matches is empty or not an array, TRUE otherwise.
   */
  protected function process_matches($matches = NULL) {

    if (!$matches || !is_array($matches)) {
      return FALSE;
    }

    // Fall back to the historical "http://" + base domain when no origin was
    // recorded (e.g. a subclass populated the properties itself).
    $origin = is_string($this->website_url_origin)
      ? $this->website_url_origin
      : 'http://' . $this->website_url_base;
    $scheme = parse_url($origin, PHP_URL_SCHEME);

    foreach ($matches as $match) {

      if (!is_string($match)) {
        continue;
      }

      // Fragments are never sent to the server, so "page#a" and "page#b" are
      // the same resource; this also discards pure in-page anchors.
      $hash_position = strpos($match, '#');
      if ($hash_position !== FALSE) {
        $match = substr($match, 0, $hash_position);
      }
      if ($match === '') {
        continue;
      }

      if (strncmp($match, '//', 2) === 0) {
        // Protocol-relative link: inherits the scheme of the crawled site.
        $match = $scheme . ':' . $match;
      }
      elseif ($match[0] === '/') {
        // Root-relative link.
        $match = $origin . $match;
      }
      elseif (preg_match('#^https?://#i', $match)) {
        // Already absolute; nothing to do.
      }
      elseif (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $match)) {
        // mailto:, javascript:, ftp:, tel:, ... cannot be crawled over HTTP.
        continue;
      }
      else {
        // Relative path; resolved against the site root (see class notes).
        $match = $origin . '/' . $match;
      }

      // Remove a single trailing slash so "/about" and "/about/" coincide.
      if (substr($match, -1) == '/') {
        $match = substr($match, 0, -1);
      }

      foreach ($this->urls_ignored as $ignored) {
        if (stripos($match, $ignored) !== FALSE) {
          continue 2;
        }
      }

      $url_base = $this->get_url_base($match);
      if ($url_base === FALSE) {
        // No usable host: neither internal nor external.
        continue;
      }

      if ($url_base !== $this->website_url_base) {
        if (!in_array($match, $this->urls_external, TRUE)) {
          $this->urls_external[] = $match;
        }
        continue;
      }

      if (in_array($match, $this->urls_processed, TRUE)) {
        continue;
      }

      if (!in_array($match, $this->urls_not_processed, TRUE)) {
        $this->urls_not_processed[] = $match;
      }
    }

    return TRUE;
  }

  /**
   * Takes the next URL off the queue, fetches it and files its links.
   *
   * @return bool
   *   FALSE when the queue is empty or the page yielded no links, otherwise
   *   the result of process_matches().
   */
  protected function process_urls_not_processed() {

    if (empty($this->urls_not_processed)) {
      return FALSE;
    }

    $url = array_shift($this->urls_not_processed);

    $matches = $this->scan_url($url);
    if (!$matches || !is_array($matches)) {
      return FALSE;
    }

    return $this->process_matches($matches);
  }

  /**
   * Prints the internal and external URL lists, one URL per line.
   */
  public function output_all_urls() {

    echo "===== INTERNAL URLS =====\n";
    foreach ($this->urls_processed as $url) {
      print $url . "\n";
    }

    echo "===== EXTERNAL URLS =====\n";
    foreach ($this->urls_external as $url) {
      print $url . "\n";
    }
  }

}
?>

It can be used like this:

<?php
// Crawl the whole site starting at the given URL (this happens inside the
// constructor), then print the internal and external URLs that were found.
$crawler = new Crawl('http://www.example.com');
$crawler->output_all_urls();
?>

In this tutorial, I'll show you how you can expose your search form on another site using jQuery. At first, I thought about scraping the form's HTML using AJAX, but quickly remembered that you cannot easily do that across domains. That led me to review the AJAX functionality included in jQuery, and bingo, there was one of my favorites: jQuery.getJSON. To summarize this code: I create a menu callback that outputs the form's HTML encoded as JSON, which can then be easily embedded on another site.

First I defined the menu hook:

<?php
/**
 * Menu hook: registers the "external-search.js" path.
 *
 * @return array
 *   A single menu item, keyed by its path, that routes the request to
 *   _MYMODULE_external_search() for users with the "search content"
 *   permission.
 */
function MYMODULE_menu() {
  return array(
    'external-search.js' => array(
      'page callback' => '_MYMODULE_external_search',
      'type' => MENU_CALLBACK,
      'access arguments' => array('search content'),
    ),
  );
}
?>

Then I created the callback function for the menu callback:

<?php
/**
 * Page callback: prints the search form's HTML as JSON or JSONP, then exits.
 *
 * When the request carries a "jsoncallback" query parameter that looks like a
 * JavaScript function name, the JSON is wrapped as "name(<json>);" so that
 * jQuery.getJSON() can load it from another domain. Any other value is
 * ignored and plain JSON is printed.
 *
 * See http://api.jquery.com/jQuery.getJSON/ for the client side.
 */
function _MYMODULE_external_search() {

  // Encode the rendered search form as a JSON string.
  $json = drupal_to_js(drupal_get_form('search_form'));

  // The callback name comes straight from the query string and is echoed into
  // a script response, so accept only identifier characters (plus "." for
  // namespaced callbacks). "\z" rejects a trailing newline that "$" allows.
  $callback = isset($_GET['jsoncallback']) ? $_GET['jsoncallback'] : '';
  if (is_string($callback) && preg_match('/^[A-Za-z_$][A-Za-z0-9_$.]*\z/', $callback)) {
    $json = $callback . "(" . $json . ");";
  }

  // Label the response as JavaScript so a browser visiting the URL directly
  // does not render the payload as an HTML page.
  if (!headers_sent()) {
    header('Content-Type: text/javascript; charset=utf-8');
  }

  print $json;

  // Stop here so the theme layer is not applied to the output.
  exit;
}
?>

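One problem, though: the form's action is a relative path, so when the form is embedded on another site it submits to that site instead of yours. That can be fixed using a form_alter function: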

<?php
/**
 * Form-alter hook: makes the embedded search form post back to this site.
 *
 * Only acts on the "search_form" form while it is being built for the
 * "external-search.js" path; every other form is left untouched.
 *
 * @param array $form
 *   The form array, altered in place.
 * @param array $form_state
 *   The form state (unused).
 * @param string $form_id
 *   The id of the form being built.
 */
function MYMODULE_form_alter(&$form, $form_state, $form_id) {
  // Drupal's scheme + host for the current request (e.g. "https://example.com").
  // Using it instead of a hard-coded "http://" keeps HTTPS sites working.
  global $base_root;

  if ($form_id != 'search_form' || arg(0) != 'external-search.js') {
    return;
  }

  // Only site-relative actions need the prefix; an action that is already
  // absolute would be corrupted by prepending the host a second time.
  if (isset($form['#action']) && is_string($form['#action']) && substr($form['#action'], 0, 1) == '/') {
    $form['#action'] = $base_root . $form['#action'];
  }
}
?>

Now, if you clear your cache and go to http://YOURSITE/external-search.js, you should see the JSON (and nothing else).

Lastly, you can embed the code on another site using a few lines of jQuery. You can even pull the jQuery from your site if the external site does not have jQuery included.

<!-- Load jQuery from your own site if the host page does not already include it -->
<script type="text/javascript" src="http://YOURSITE/misc/jquery.js"></script>

<!-- Placeholder element that will receive the search form markup -->
<div id="embedded_search"></div>

<!-- Once the DOM is ready, fetch the form via JSONP and inject it -->
<script type="text/javascript">
$(function () {
  // "jsoncallback=?" makes jQuery substitute a generated callback name
  var endpoint = "http://YOURSITE/external-search.js?jsoncallback=?";

  // Receives the form's HTML string and adds it to the placeholder
  function showSearchForm(formHtml) {
    $('#embedded_search').append(formHtml);
  }

  $.getJSON(endpoint, showSearchForm);
});
</script>

Now people should be able to access your site's search form from another site!

Syndicate content