juust ~ php oddities

Unordered list of one element
  • rss
  • begin
  • about
    • vcard
    • WTF is BroJesus
  • php scripts
    • flickr wp widget
    • google multi key serp tool, php script
    • gwt plugin
  • php classes
    • php pagerank class
    • fibonacci class
    • robots.txt parser php class
  • serp
    • serp dashboard wordpress plugin
  • services

robots.txt parser php class

for completeness, one from my other site

  1.  
  /**
   * Holds the "User-agent: *" Disallow rules of one domain's robots.txt.
   *
   * The file is fetched and parsed once, in the constructor; check() can then
   * be called for any number of URLs without fetching or parsing again.
   *
   * Matching is deliberately crude: every "/" is removed from both the rule
   * and the URL, and a URL counts as disallowed when the slash-less
   * "http:<host><rule>" string occurs anywhere in it (case-insensitively).
   * Rules are always built with the "http:" scheme, so URLs given with another
   * scheme never match a rule.
   */
  class Robots {

      /** @var Agent[] Agent objects created so far, keyed by user-agent line. */
      public $Agents = array();

      /** @var string The robots.txt section header whose rules are collected. */
      public $my_user_agent = "User-agent: *"; //my useragent

      /**
       * Fetches http://<domain>/robots.txt and stores its wildcard-agent rules.
       *
       * An unreadable or missing robots.txt simply yields no rules.
       *
       * @param string $domain Host name without scheme, e.g. "www.example.com".
       */
      public function __construct($domain) {
          $MyAgent = $this->Agents($this->my_user_agent);

          $robotsdomain = trim("http://".$domain);
          $stripped_robotsdomain = str_replace("/", "", $robotsdomain);

          $robots = explode("\n", $this->Read_Content($robotsdomain.'/robots.txt'));
          $total = count($robots);

          for ($i = 0; $i < $total; $i++) {
              if (trim($robots[$i]) != $this->my_user_agent) {
                  continue;
              }

              // Walk the lines below the section header until the next
              // User-agent line or the end of the file.
              for ($next = $i + 1; $next < $total; $next++) {
                  $line = $robots[$next];

                  // Drop a trailing "# comment" before looking at the directive.
                  $pos = strpos($line, "#");
                  if ($pos !== false) {
                      $line = substr($line, 0, $pos);
                  }
                  $line = trim($line);

                  if ($line == "") {
                      continue;
                  }
                  if (stripos($line, "User-agent") === 0) {
                      break; // start of another agent's section
                  }
                  if (stripos($line, "Disallow:") !== 0) {
                      continue; // Allow:, Sitemap:, Crawl-delay: ... are not rules
                  }

                  $disallow_line = trim(substr($line, strlen("Disallow:")));
                  if ($disallow_line == "") {
                      continue; // an empty Disallow forbids nothing
                  }

                  $disallow_line = str_replace("/", "", $disallow_line);
                  $MyAgent->DisallowedDirectories($stripped_robotsdomain.$disallow_line);
              }
          }
      }

      /**
       * Tells whether a URL may be fetched according to the stored rules.
       *
       * @param string $url Absolute URL on the domain given to the constructor.
       * @return int 1 when no rule matches (allowed), 0 when a rule matches
       *             (disallowed).
       */
      public function check($url) {
          $stripped_current_url = str_replace("/", "", $url);
          $MyAgent = $this->Agents($this->my_user_agent);

          foreach ($MyAgent->DisallowedDirectories as $dir) {
              // Plain substring search: rule text is never interpreted as a
              // regular expression, so characters like "*" or "?" are harmless.
              if (stripos($stripped_current_url, trim($dir->path)) !== false) {
                  return 0;
              }
          }
          return 1;
      }

      /**
       * Returns the Agent object for a user-agent line, creating it on first use.
       *
       * @param string $code User-agent line used as the lookup key.
       * @return Agent
       */
      public function Agents($code) {
          if (!isset($this->Agents[$code])) {
              $this->Agents[$code] = new Agent($code);
          }
          return $this->Agents[$code];
      }

      /**
       * Opens a URL (or local path) and returns its content.
       *
       * At most 512000 bytes are read so that an oversized response cannot
       * exhaust memory.
       *
       * @param string $url Location to read.
       * @return string The content, or "" when it cannot be read.
       */
      public function Read_Content($url) {
          // "@" keeps the original best-effort behaviour: an unreachable
          // robots.txt is not an error, it just means there are no rules.
          $contents = @file_get_contents($url, false, null, 0, 512000);
          return ($contents === false) ? "" : $contents;
      }

  }
  67.  
  /**
   * Collects the disallowed paths that belong to one robots.txt user-agent.
   */
  class Agent {

      /** @var DisallowedDirectory[] Rules stored so far, keyed by rule string. */
      public $DisallowedDirectories = array();

      /**
       * Returns the rule object for a rule string, creating it on first use.
       *
       * @param string $code Rule string used both as key and as stored path.
       * @return DisallowedDirectory
       */
      public function DisallowedDirectories($code) {
          // isset() avoids the undefined-key warning the first time a rule is seen.
          if (!isset($this->DisallowedDirectories[$code])) {
              $this->DisallowedDirectories[$code] = new DisallowedDirectory($code);
          }
          return $this->DisallowedDirectories[$code];
      }

  }
  80.  
  /**
   * One disallowed path, kept both as given and with every "/" removed.
   */
  class DisallowedDirectory {

      /** @var string The path exactly as passed to the constructor. */
      public $path;

      /** @var string The same path with all "/" characters removed. */
      public $pathstripped;

      /**
       * @param string $apath Path (or rule string) to store.
       */
      public function __construct($apath) {
          $this->path = $apath;
          $this->pathstripped = str_replace("/", "", $apath);
      }

  }
  93.  
  94.  
  /**
   * Procedural check of a URL against the "User-agent: *" section of its
   * host's robots.txt. The file is fetched and parsed on every call.
   *
   * Matching removes every "/" from both rule and URL and reports the URL as
   * disallowed when "http:<host><rule>" occurs anywhere in it
   * (case-insensitively). Rules are always built with the "http:" scheme.
   *
   * The robots.txt is read with file_get_contents() so that the function does
   * not depend on a separately defined Read_Content() helper.
   *
   * @param string $url Absolute URL such as "http://host/path".
   * @return int 1 when the URL is allowed (also when no host can be extracted
   *             or robots.txt cannot be read), 0 when a rule matches.
   */
  function robots_allowed($url){
      // "scheme://host/..." splits into scheme:, "", host, ... so the host is
      // the third piece; without it there is nothing to fetch.
      $xmp = explode("/", $url."/");
      if (!isset($xmp[2]) || trim($xmp[2]) == "") {
          return 1;
      }

      $robotsdomain = trim("http://".$xmp[2]);
      $stripped_robotsdomain = str_replace("/", "", $robotsdomain);
      $stripped_current_url = str_replace("/", "", $url);
      $my_user_agent = "User-agent: *"; //my useragent

      // Best effort: an unreachable robots.txt means no restrictions.
      $content = @file_get_contents($robotsdomain.'/robots.txt', false, null, 0, 512000);
      if ($content === false) {
          return 1;
      }

      $robots = explode("\n", $content);
      $total = count($robots);
      $newdata = array();

      for ($i = 0; $i < $total; $i++) {
          if (trim($robots[$i]) != $my_user_agent) { // rules for agent: *
              continue;
          }

          for ($next = $i + 1; $next < $total; $next++) {
              $line = $robots[$next];

              // Drop a trailing "# comment" before looking at the directive.
              $pos = strpos($line, "#");
              if ($pos !== false) {
                  $line = substr($line, 0, $pos);
              }
              $line = trim($line);

              if ($line == "") {
                  continue;
              }
              if (stripos($line, "User-agent") === 0) {
                  break; // start of another agent's section
              }
              if (stripos($line, "Disallow:") !== 0) {
                  continue; // only Disallow lines are rules
              }

              $disallow_line = trim(substr($line, strlen("Disallow:")));
              if ($disallow_line == "") {
                  continue; // an empty Disallow forbids nothing
              }

              $disallow_line = str_replace("/", "", $disallow_line);
              $newdata[] = $stripped_robotsdomain.$disallow_line;
          }
      }

      foreach ($newdata as $rule) {
          // Plain substring search, so rule text is never treated as a regex.
          if (stripos($stripped_current_url, $rule) !== false) {
              return 0;
          }
      }
      return 1;
  }

usage

  1. $thisurl = "http://www.juust.org/seo/blackwidow/index.php"; // URL whose crawl permission is being checked
  2. $rb = new Robots(parse_url($thisurl, PHP_URL_HOST)); // the constructor is given only the host part of the URL
  3. $rb->check($thisurl); // the return value is discarded here; assign it to act on the result

that should return 0, disallowed

The advantage of using a class is that I parse the file only once, so when I am spidering I don’t have to parse the raw text every time I want to check whether a file is ‘accessible’. It’s just a rough sketch.

this is the original code :
# Original PHP code by Chirp Internet: www.chirp.com.au
# Please acknowledge use of this code by including this header.
# http://www.the-art-of-web.com/php/parse-robots/

  /**
   * Checks a URL against its host's robots.txt.
   *
   * Disallow rules are collected from every section whose User-agent value
   * contains "*" or, when given, $useragent. A rule matches as a prefix of the
   * URL path; inside a rule "*" stands for any run of characters and a
   * trailing "$" anchors the rule to the end of the path.
   *
   * @param string       $url       Absolute URL to test.
   * @param string|false $useragent Optional crawler name to match in addition
   *                                to "*".
   * @return bool true when the page may be fetched (also when the URL has no
   *              host or robots.txt cannot be read), false when disallowed.
   */
  function PHProbots_allowed($url, $useragent=false) {

      # parse url to retrieve host and path
      $parsed = parse_url($url);
      if (!is_array($parsed) || empty($parsed['host'])) return true;
      # a URL such as "http://host" has no path component; treat it as the root
      $path = isset($parsed['path']) ? $parsed['path'] : '/';

      # quote with the "/" delimiter so agents like "Mozilla/5.0" stay valid
      $agents = array(preg_quote('*', '/'));
      if($useragent) $agents[] = preg_quote($useragent, '/');
      $agents = implode('|', $agents);

      # location of robots.txt file
      $robotstxt = @file("http://{$parsed['host']}/robots.txt");
      if(!$robotstxt) return true;
      $rules = array();
      $ruleapplies = false;

      foreach($robotstxt as $line) {
          # skip blank lines
          if(!$line = trim($line)) continue;
          # following rules only apply if User-agent matches $useragent or '*'
          if(preg_match('/User-agent: (.*)/i', $line, $match)) {
              $ruleapplies = preg_match("/($agents)/i", $match[1]);
          }
          if($ruleapplies && preg_match('/Disallow:(.*)/i', $line, $regs)) {
              $rule = trim($regs[1]);
              # an empty rule implies full access - no further tests required
              if($rule === '') return true;
              # add rules that apply to array for testing; after quoting,
              # turn the robots.txt wildcards back into regex equivalents
              $pattern = str_replace('\*', '.*', preg_quote($rule, '/'));
              if(substr($pattern, -2) === '\$') $pattern = substr($pattern, 0, -2).'$';
              $rules[] = $pattern;
          }
      }
      foreach($rules as $rule) {
          # check if page is disallowed to us
          if(preg_match("/^$rule/", $path)) return false;
      }
      # page is not disallowed
      return true;
  }

         

Tags
class, php, robots
Comments rss
Comments rss
Trackback
Trackback

2 Responses to “robots.txt parser php class”

  1. Deepak Pradhan says:
    08/12/2009 at 7:22 pm

    if(preg_match("/^$rule/", $parsed['path'])) return false;

    fails if rule=/*/mac/help.mspx

    Reply
  2. juust says:
    10/12/2009 at 6:18 pm

    @Deepak,
    thanks for the feedback,

    Some shell wildcards like * are quantifiers in PHP regex, which causes the match to fail with a warning. I can use fnmatch() http://www.php.net/manual/en/function.fnmatch.php, which handles shell wildcards, but it doesn’t work on non-POSIX systems. php.net does offer a preg_match-based replacement, but that requires rewriting the class here and there.

    I noticed a few more bugs in it so I am going to put a new version on the blog next week.

    Reply

Leave a Reply

Click here to cancel reply.

Recent Posts

  • p2p with wordpress xml-rpc
  • Tweets on Google’s frontpage
  • happy new year
  • metaWeblog.newPost posting to Wordpress from Word
  • IE is retarded

click me!
rss
Comments rss
Blog Directory
Web Developement Blogs - BlogCatalog Blog Directory
Listed in LS Blogs the Blog Directory and Blog Search Engine
Blog Flux Directory
joopita.com free web directory and search engine
design by jide
sitemap
8076 confirmed spam kills