robots.txt parser PHP class

For completeness, one from my other site:

Class Robots {

 var $Agents = array();
 var $my_user_agent = "User-agent: *"; // my useragent

 public function __construct($domain) {

  $MyAgent = $this->Agents($this->my_user_agent);

  $robotsdomain = trim("http://".$domain);
  $stripped_robotsdomain = str_replace("/", "", $robotsdomain);
  $robots = $this->Read_Content($robotsdomain.'/robots.txt');
  $robots = explode("\n", $robots);
  for ($i = 0; $i < sizeof($robots); $i++) {

   if (trim($robots[$i]) == $this->my_user_agent) { // rules for agent: *
    for ($checkrules = 1; $checkrules < 10; $checkrules++) {
     if (isset($robots[$i + $checkrules]) && trim($robots[$i + $checkrules]) != "") {

      $current_line = $robots[$i + $checkrules];

      // a new User-agent record ends this block of rules
      $pos = strpos($current_line, "User-agent");
      if (is_integer($pos)) break;

      // strip trailing comments
      $pos = strpos($current_line, "#");
      if (is_integer($pos)) $current_line = substr($current_line, 0, $pos);

      $disallow_line = str_replace("Disallow: ", "", $current_line);
      $disallow_line = str_replace("/", "", $disallow_line);
      $MyAgent->DisallowedDirectories($stripped_robotsdomain.$disallow_line);
     }
    }
   }
  }

 }

 public function check($url) {
  $forbidden = 1;
  $stripped_current_url = str_replace("/", "", $url);
  $MyAgent = $this->Agents($this->my_user_agent);
  foreach ($MyAgent->DisallowedDirectories as $dir) {
   if (preg_match("/".preg_quote(trim($dir->path), "/")."/i", $stripped_current_url)) { $forbidden = 0; }
  }
  return $forbidden; // 0 = disallowed, 1 = allowed
 }

 public function Agents($code) {
  if (!isset($this->Agents[$code])) {
   $this->Agents[$code] = new Agent($code);
  }
  return $this->Agents[$code];
 }

 public function Read_Content($url) { // open a url, return content
  $contents = "";
  $handle = @fopen($url, "r");
  if ($handle) {
   $contents = fread($handle, 10000);
   fclose($handle);
  }
  return $contents;
 }

}

Class Agent {

 var $DisallowedDirectories = array();

 public function DisallowedDirectories($code) {
  if (!isset($this->DisallowedDirectories[$code])) {
   $this->DisallowedDirectories[$code] = new DisallowedDirectory($code);
  }
  return $this->DisallowedDirectories[$code];
 }

}

Class DisallowedDirectory {

 var $path;
 var $pathstripped;

 public function __construct($apath) {
  $this->path = $apath;
  $this->pathstripped = str_replace("/", "", $apath);
 }

}


function robots_allowed($url) {
 $current_url = $url;
 $xmp = explode("/", $current_url."/");
 $robotsdomain = trim("http://".$xmp[2]);
 $stripped_robotsdomain = str_replace("/", "", $robotsdomain);
 $stripped_current_url = str_replace("/", "", $url);
 $my_user_agent = "User-agent: *"; // my useragent
 $newdata = array();
 $num = 0;

 // standalone version: fetch robots.txt directly
 $robots = @file_get_contents($robotsdomain.'/robots.txt');
 $robots = explode("\n", $robots);
 for ($i = 0; $i < sizeof($robots); $i++) {

  if (trim($robots[$i]) == $my_user_agent) { // rules for agent: *
   for ($checkrules = 1; $checkrules < 10; $checkrules++) {
    if (isset($robots[$i + $checkrules]) && trim($robots[$i + $checkrules]) != "") {

     $current_line = $robots[$i + $checkrules];

     // a new User-agent record ends this block of rules
     $pos = strpos($current_line, "User-agent");
     if (is_integer($pos)) break;

     // strip trailing comments
     $pos = strpos($current_line, "#");
     if (is_integer($pos)) $current_line = substr($current_line, 0, $pos);

     $disallow_line = str_replace("Disallow: ", "", $current_line);
     //$disallow_line = str_replace("http://", "", $disallow_line);
     $disallow_line = str_replace("/", "", $disallow_line);
     $newdata[$num] = $stripped_robotsdomain.$disallow_line;

     $num++;
    }
   }
  }
 }

 $forbidden = 1;
 for ($last = 0; $last < sizeof($newdata); $last++) {
  if (trim($newdata[$last]) != "") {
   if (preg_match("/".preg_quote(trim($newdata[$last]), "/")."/i", $stripped_current_url)) { $forbidden = 0; }
  }
 }
 return $forbidden; // 0 = disallowed, 1 = allowed
}

usage

$thisurl = "http://www.juust.org/seo/blackwidow/index.php";
$rb = new Robots(parse_url($thisurl, PHP_URL_HOST));
$rb->check($thisurl);

That should return 0: the URL matches a Disallow rule.

The advantage of using a class is that I parse the file only once, so if I am spidering I don't have to parse the dumb text every time to check whether a file is 'accessible'. It's just a rough sketch.
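A minimal sketch of that reuse (the URL list here is hypothetical): the constructor fetches and parses robots.txt once per host, and every check() call afterwards only tests the cached rules.

// Hypothetical spider loop: robots.txt is fetched and parsed once,
// in the Robots constructor; check() then runs against the cached rules.
$urls = array(
 "http://www.juust.org/seo/blackwidow/index.php",
 "http://www.juust.org/sitemap.xml",
);
$rb = new Robots(parse_url($urls[0], PHP_URL_HOST));
foreach ($urls as $url) {
 if ($rb->check($url) == 1) {
  // allowed: fetch and process the page here
 }
}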

This is the original code:
# Original PHP code by Chirp Internet: www.chirp.com.au
# Please acknowledge use of this code by including this header.
# http://www.the-art-of-web.com/php/parse-robots/

function PHProbots_allowed($url, $useragent=false) {

 # parse url to retrieve host and path
 $parsed = parse_url($url);
 $agents = array(preg_quote('*'));
 if($useragent) $agents[] = preg_quote($useragent);
 $agents = implode('|', $agents);

 # location of robots.txt file
 $robotstxt = @file("http://{$parsed['host']}/robots.txt");
 if(!$robotstxt) return true;
 $rules = array();
 $ruleapplies = false;

 foreach($robotstxt as $line) {
  # skip blank lines
  if(!$line = trim($line)) continue;
  # following rules only apply if User-agent matches $useragent or '*'
  if(preg_match('/User-agent: (.*)/i', $line, $match)) {
   $ruleapplies = preg_match("/($agents)/i", $match[1]);
  }
  if($ruleapplies && preg_match('/Disallow:(.*)/i', $line, $regs)) {
   # an empty rule implies full access - no further tests required
   if(!$regs[1]) return true;
   # add rules that apply to array for testing
   $rules[] = preg_quote(trim($regs[1]), '/');
  }
 }
 foreach($rules as $rule) {
  # check if page is disallowed to us
  if(preg_match("/^$rule/", $parsed['path'])) return false;
 }
 # page is not disallowed
 return true;
}

2 Comments

  1. if(preg_match("/^$rule/", $parsed['path'])) return false;

     fails if rule=/*/mac/help.mspx

  2. @Deepak,
     thanks for the feedback,

     Some shell wildcards like * are quantifiers in PHP regex syntax, which makes preg_match fail with a warning. I could use fnmatch() (http://www.php.net/manual/en/function.fnmatch.php), which handles shell wildcards, but it doesn't work on non-POSIX systems. Php.net does offer a preg_match-based replacement, but that requires rewriting the class here and there; a rough sketch of the idea follows below.

     I noticed a few more bugs in it, so I am going to put a new version on the blog next week.
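A minimal sketch of that preg_match-based replacement (robots_rule_to_regex is a hypothetical helper, not part of the class above): shell-style * becomes .*, a trailing $ becomes an end anchor, and everything else is escaped with preg_quote().

// Hypothetical helper: turn a robots.txt Disallow pattern into a
// regex that preg_match can use safely.
function robots_rule_to_regex($rule) {
 $rule = trim($rule);
 $anchored = (substr($rule, -1) == '$');   // '$' anchors the end of the URL
 if ($anchored) $rule = substr($rule, 0, -1);
 $regex = '';
 foreach (explode('*', $rule) as $i => $part) {
  if ($i > 0) $regex .= '.*';              // shell '*' becomes regex '.*'
  $regex .= preg_quote($part, '/');        // escape literal characters
 }
 return '/^'.$regex.($anchored ? '$' : '').'/i';
}

// Deepak's failing case now compiles and matches as intended:
$re = robots_rule_to_regex('/*/mac/help.mspx');
var_dump(preg_match($re, '/windows/mac/help.mspx')); // int(1)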
