require 'nokogiri'
require 'open-uri'
require 'uri'
# Crawls a website starting from a single URL, following every link that stays
# on the same site and sorting the links it finds into buckets:
#
# * urls_processed - pages that were fetched (or at least attempted)
# * urls_external  - links pointing at a different site
# * urls_invalid   - links that could not be resolved or parsed
# * urls_error     - pages whose fetch or parse raised an error
#
# Pages are parsed with Nokogiri, so the file using this class must have
# required 'nokogiri', 'open-uri' and 'uri'.
class NokoCrawl
  attr_reader :urls_processed, :urls_external, :urls_invalid, :urls_error

  # href prefixes that never lead to a crawlable page
  SKIPPED_PREFIXES = ['#', 'javascript:', 'mailto:'].freeze

  # url: absolute URL the crawl starts from.
  # Prints a message and exits the process when the URL does not validate.
  def initialize(url)
    unless url_valid? url
      puts "Initial URL is not valid.\n"
      exit
    end

    @urls_processed = []
    @urls_not_processed = []
    @urls_external = []
    @urls_invalid = []
    @urls_error = []
    @url_last_scanned = nil

    # define ignore list, array of regex
    # todo: turn this into an argument
    @urls_ignore_list = [
      /posts\/.*?\/comments\/.*?\/new/
    ]

    process_initial_url url

    # add first url to process queue
    @urls_not_processed << url
  end

  # Drains the queue: fetches each queued URL and feeds the links found on it
  # back through process_scanned_urls until nothing is left to visit.
  def start_scan
    until @urls_not_processed.empty?
      url = @urls_not_processed.shift
      process_scanned_urls(scan_url(url))
    end
  end

  # Simple url validation: true when the string contains something that looks
  # like a URI (the pattern is not anchored). nil is never valid.
  def url_valid?(url)
    URI.regexp.match?(url)
  end

  # Remembers the first url and derives the host, scheme and origin
  # ("scheme://host[:port]") used to resolve and classify later links.
  def process_initial_url(url)
    @url_initial = url
    uri = URI(@url_initial)
    @url_host = uri.host
    @url_scheme = uri.scheme
    @url_scheme_host = origin_of(uri)
  end

  # Fetches one url and returns the href of every <a> on the page (entries may
  # be nil for anchors without an href). The url is recorded as processed
  # before the fetch; on any error it is added to urls_error and [] is returned.
  def scan_url(url)
    @urls_processed << url
    @url_last_scanned = url

    begin
      # URI#open (open-uri) only handles real URIs and, unlike Kernel#open,
      # never falls back to files or pipes; the block form closes the handle.
      doc = URI(url).open { |io| Nokogiri::HTML(io) }
    rescue StandardError
      @urls_error << url
      return []
    end

    doc.css('a').map { |a| a['href'] }
  end

  # Loops through scanned hrefs: resolves them to absolute urls, drops the
  # ones to ignore, and files each remaining url as invalid, external or
  # queued for scanning. Relative links are resolved against the url most
  # recently passed to scan_url.
  #
  # todo: does not work with urls that start with '../'
  # todo: remove trailing slash?
  # todo: ignore/remove anchors from urls?
  def process_scanned_urls(urls)
    urls.each do |url|
      # anchors without an href, and empty hrefs (which point at the page itself)
      next if url.nil? || url.empty?
      # fragments, javascript: and mailto: links
      next if url.start_with?(*SKIPPED_PREFIXES)

      if url.start_with?('//')
        # protocol-relative link: reuse the scheme of the crawled site
        url = "#{@url_scheme}:#{url}"
      elsif url.start_with?('/')
        # internal link relative to the site root
        url = @url_scheme_host + url
      end

      # relative links beginning with '../' are not resolved yet
      if url.start_with?('../')
        add_unique(@urls_invalid, url)
        next
      end

      # any other relative link lives next to the page it was found on
      unless url.start_with?('http://', 'https://')
        url = url_with_trailing_slash(@url_last_scanned) + url
      end

      unless url_valid? url
        add_unique(@urls_invalid, url)
        next
      end

      # already scanned or already queued
      next if @urls_processed.include?(url) || @urls_not_processed.include?(url)
      next if @urls_ignore_list.any? { |regex| regex =~ url }

      begin
        uri = URI(url)
      rescue URI::InvalidURIError
        # url_valid? is only a loose check; a url that cannot actually be
        # parsed (e.g. it contains a space) must not abort the whole crawl
        add_unique(@urls_invalid, url)
        next
      end

      # check for external link
      if @url_host != uri.host && !url_hosts_same?(uri.host, @url_host)
        add_unique(@urls_external, url)
        next
      end

      # add url to list to process
      @urls_not_processed << url
    end
  end

  # Checks whether two hosts belong to the same site by comparing their last
  # two labels, e.g. ericlondon.com == www.ericlondon.com.
  # note: both hosts need at least one period, so "localhost" never matches;
  # hosts under a two-label suffix (a.co.uk vs b.co.uk) compare as equal.
  def url_hosts_same?(url_1, url_2)
    return false if url_1.nil? || url_2.nil?

    url_1_split = url_1.split '.'
    url_2_split = url_2.split '.'
    return false unless url_1_split.size > 1 && url_2_split.size > 1

    url_1_split.last(2) == url_2_split.last(2)
  end

  # Returns the "directory" of a url, always ending in a slash.
  # note: removes "index.html" from url structure: http://example.com/test/index.html
  # Query strings and fragments are dropped when the last segment is removed.
  def url_with_trailing_slash(url)
    return url if url.end_with?('/')

    # remove everything after last '/'
    uri = URI(url)
    uri_path_parts = uri.path.split '/'
    uri_path_parts.pop
    "#{origin_of(uri)}#{uri_path_parts.join('/')}/"
  end

  private

  # "scheme://host", plus ":port" when the port is not the scheme's default.
  def origin_of(uri)
    port = uri.port && uri.port != uri.default_port ? ":#{uri.port}" : ''
    "#{uri.scheme}://#{uri.host}#{port}"
  end

  # Appends url to list unless it is already present.
  def add_unique(list, url)
    list << url unless list.include?(url)
  end
end
# Example run: crawl example.com and print every url that was scanned.
# Only runs when this file is executed directly, so requiring it from other
# code does not kick off a network crawl.
if __FILE__ == $PROGRAM_NAME
  noko = NokoCrawl.new 'http://example.com'
  noko.start_scan
  # p already calls inspect; inspecting twice printed an escaped string
  p noko.urls_processed
end
<?php
/**
 * Simple same-site link crawler.
 *
 * Constructing the object runs the whole crawl: starting from $website_url it
 * downloads each page, extracts href values with a regular expression and
 * keeps following links whose host shares the start URL's base domain.
 * Results are printed with output_all_urls().
 */
class Crawl {

  // regular expression used to pull href values out of <a> tags
  protected $regex_link;
  // start URL as passed to the constructor
  protected $website_url;
  // last two host labels of the start URL, e.g. "example.com"
  protected $website_url_base;
  // "scheme://host[:port]" of the start URL, used to absolutize links
  protected $website_url_origin;
  // URLs that were scanned (or attempted)
  protected $urls_processed;
  // URLs whose base domain differs from the start URL's
  protected $urls_external;
  // queue of URLs still to scan
  protected $urls_not_processed;
  // substrings; any URL containing one of them is skipped
  protected $urls_ignored;

  /**
   * Sets up state and crawls the site.
   *
   * @param string $website_url absolute http:// or https:// URL to start from.
   *   When it fails validation the object is left with empty result lists.
   */
  public function __construct($website_url = NULL) {

    // setup variables
    $this->regex_link = "/<\s*a\s+[^>]*href\s*=\s*[\"']?([^\"' >]+)[\"' >]/isU";
    $this->urls_processed = array();
    $this->urls_external = array();
    $this->urls_not_processed = array();
    $this->urls_ignored = array(
      '/search/apachesolr_search/',
      '/comment/reply/',
    );

    // validate argument(s); a constructor cannot return a value, so just stop
    if (!$this->validate_arg_website_url($website_url)) {
      return;
    }

    // set website argument
    $this->website_url = $website_url;

    // get url base
    $url_base = $this->get_url_base($this->website_url);
    if (!$url_base) {
      return;
    }
    $this->website_url_base = $url_base;
    $this->website_url_origin = $this->get_url_origin($this->website_url);

    // add url to list of urls to process
    $this->urls_not_processed[] = $this->website_url;

    while (count($this->urls_not_processed)) {
      $this->process_urls_not_processed();
    }

    // sort data
    sort($this->urls_processed);
    sort($this->urls_external);
  }

  /**
   * @param mixed $website_url
   * @return bool TRUE when the argument is a string starting with http:// or https://
   */
  protected function validate_arg_website_url($website_url = NULL) {
    return is_string($website_url)
      && (substr($website_url, 0, 7) == 'http://' || substr($website_url, 0, 8) == 'https://');
  }

  /**
   * Returns the last two labels of a URL's host ("www.example.com" gives
   * "example.com"). A single-label host such as "localhost" is returned as is.
   *
   * @param string $url
   * @return string|false FALSE when the URL is empty or has no host
   */
  protected function get_url_base($url = NULL) {

    // validate url
    if (!$url || !strlen($url)) {
      return FALSE;
    }

    $url_parts = parse_url($url);

    // parse_url() returns FALSE on malformed input and omits 'host' when absent
    if (!is_array($url_parts) || empty($url_parts['host'])) {
      return FALSE;
    }

    // explode host on '.'
    $exploded = explode('.', $url_parts['host']);
    if (count($exploded) < 2) {
      return $url_parts['host'];
    }

    // return host and domain extension
    return implode('.', array_slice($exploded, -2));
  }

  /**
   * Returns "scheme://host[:port]" for a URL.
   *
   * @param string $url
   * @return string|false FALSE when scheme or host cannot be determined
   */
  protected function get_url_origin($url) {
    $url_parts = is_string($url) ? parse_url($url) : FALSE;
    if (!is_array($url_parts) || empty($url_parts['scheme']) || empty($url_parts['host'])) {
      return FALSE;
    }
    $origin = $url_parts['scheme'] . '://' . $url_parts['host'];
    if (isset($url_parts['port'])) {
      $origin .= ':' . $url_parts['port'];
    }
    return $origin;
  }

  /**
   * Downloads a URL and returns the unique href values found on the page.
   * The URL is added to the processed list before the download is attempted.
   *
   * @param string $url
   * @return array|false FALSE when the URL is invalid, was already processed,
   *   could not be loaded, or came back empty
   */
  protected function scan_url($url) {

    // validate url
    if (!is_string($url) || !$url || !strlen($url)) {
      return FALSE;
    }

    // ensure url has not already been processed
    if (in_array($url, $this->urls_processed)) {
      return FALSE;
    }

    // add url to processed list
    $this->urls_processed[] = $url;

    // load page contents; failure is reported through the FALSE return value
    // ($php_errormsg / track_errors no longer exist in PHP 8), so the warning
    // itself is suppressed
    $page_contents = @file_get_contents($url);
    if ($page_contents === false || !strlen($page_contents)) {
      return FALSE;
    }

    // execute regex
    preg_match_all($this->regex_link, $page_contents, $matches);
    if (is_array($matches) && isset($matches[1])) {
      return array_unique($matches[1]);
    }

    return FALSE;
  }

  /**
   * Turns raw href values into absolute URLs and files each one as ignored,
   * external, already known, or queued for scanning.
   *
   * @param array $matches href values as returned by scan_url()
   * @return bool FALSE when there was nothing to process
   */
  protected function process_matches($matches = NULL) {

    // validate
    if (!$matches || !is_array($matches) || empty($matches)) {
      return FALSE;
    }

    // Resolve links against the start URL's own scheme and host so https
    // sites and "www." hosts are not rewritten to http://<base domain>.
    // Falls back to the old form when the origin was never set.
    $origin = $this->website_url_origin
      ? $this->website_url_origin
      : 'http://' . $this->website_url_base;
    $scheme = parse_url($origin, PHP_URL_SCHEME);

    foreach ($matches as $match) {

      // ensure match exists
      if (empty($match)) {
        continue;
      }
      // ignore anchors
      elseif (substr($match, 0, 1) == '#') {
        continue;
      }
      // ignore javascript
      elseif (substr($match, 0, 11) == 'javascript:') {
        continue;
      }
      // ignore mailto
      elseif (substr($match, 0, 7) == 'mailto:') {
        continue;
      }

      // protocol-relative urls ('//host/path') only lack the scheme
      if (substr($match, 0, 2) == '//') {
        $match = $scheme . ':' . $match;
      }
      // check for internal urls that begin with '/'
      elseif (substr($match, 0, 1) == '/') {
        $match = $origin . $match;
      }

      // remove trailing slash
      if (substr($match, -1) == '/') {
        $match = substr($match, 0, -1);
      }

      // ensure href starts with http or https
      // NOTE: this needs work, URL could begin with relative paths like '../', ftp://, etc.
      if (!(substr($match, 0, 7) == 'http://' || substr($match, 0, 8) == 'https://')) {
        $match = $origin . '/' . $match;
      }

      // check if url is to be ignored
      foreach ($this->urls_ignored as $ignored) {
        if (stripos($match, $ignored) !== FALSE) {
          continue 2;
        }
      }

      // get url base
      $url_base = $this->get_url_base($match);

      // check for external url
      if ($url_base != $this->website_url_base) {
        if (!in_array($match, $this->urls_external)) {
          $this->urls_external[] = $match;
        }
        continue;
      }

      // check if url has already been processed
      if (in_array($match, $this->urls_processed)) {
        continue;
      }

      // add url to list of urls to process
      if (!in_array($match, $this->urls_not_processed)) {
        $this->urls_not_processed[] = $match;
      }

    }

    return TRUE;
  }

  /**
   * Takes the next URL off the queue, scans it and processes its links.
   *
   * @return bool FALSE when the queue is empty or the page yielded no links
   */
  protected function process_urls_not_processed() {

    if (empty($this->urls_not_processed)) {
      return FALSE;
    }

    // get unprocessed url
    $url = array_shift($this->urls_not_processed);

    // scan url
    $matches = $this->scan_url($url);

    // error check
    if (!$matches || !is_array($matches) || empty($matches)) {
      return FALSE;
    }

    $this->process_matches($matches);

    return TRUE;
  }

  /**
   * Prints the processed (internal) and external URL lists, one URL per line.
   */
  public function output_all_urls() {
    echo "===== INTERNAL URLS =====\n";
    foreach ($this->urls_processed as $url) {
      print $url . "\n";
    }
    echo "===== EXTERNAL URLS =====\n";
    foreach ($this->urls_external as $url) {
      print $url . "\n";
    }
  }

}
?><?php
// Example run: crawl the demo site (the constructor performs the crawl)
// and print the internal and external URLs that were found.
$website_url = 'http://www.example.com';
$crawl = new Crawl($website_url);

$crawl->output_all_urls();
?>