robots.txt parser PHP class
For completeness, here is one from my other site.
-
-
/**
 * Downloads a host's robots.txt once and answers crawl-permission lookups
 * from the parsed result.
 *
 * Only "Disallow" lines of the group named by $my_user_agent are honoured.
 * "Allow" lines and the "*" / "$" pattern characters are not interpreted;
 * every Disallow value is treated as a literal path prefix.
 */
class Robots {

    /** @var array Agent objects, keyed by the User-agent line they were created for. */
    public $Agents = array();

    /** @var string The robots.txt line that opens the group this crawler obeys. */
    public $my_user_agent = "User-agent: *"; //my useragent

    /** @var string Lower-cased host the rules were loaded for ('' when it could not be determined). */
    private $host = '';

    /**
     * Fetches http://$domain/robots.txt and stores each applicable Disallow
     * path on the Agent registered under $my_user_agent.
     *
     * @param string $domain Host name without scheme, e.g. "www.example.com".
     */
    public function __construct($domain) {
        $base = "http://" . trim($domain);

        $host = parse_url($base, PHP_URL_HOST);
        $this->host = is_string($host) ? strtolower($host) : '';

        $agent = $this->Agents($this->my_user_agent);

        $text = $this->Read_Content($base . '/robots.txt');
        foreach ($this->disallowedPaths($text) as $path) {
            // Agent keys its entries by path, so repeated rules collapse into one.
            $agent->DisallowedDirectories($path);
        }
    }

    /**
     * Tells whether $url may be fetched.
     *
     * @param string $url URL to test; its path (plus query string, if any)
     *                    is compared against the stored Disallow prefixes.
     * @return int 1 when no stored rule matches, 0 when the URL is disallowed.
     */
    public function check($url) {
        $parts = parse_url($url);
        if (!is_array($parts)) {
            $parts = array();
        }

        // The rules belong to one host only; they say nothing about any other.
        if (isset($parts['host']) && $this->host !== ''
            && strcasecmp($parts['host'], $this->host) !== 0) {
            return 1;
        }

        $target = (isset($parts['path']) && $parts['path'] !== '') ? $parts['path'] : '/';
        if (isset($parts['query'])) {
            $target .= '?' . $parts['query'];
        }

        $agent = $this->Agents($this->my_user_agent);
        foreach ($agent->DisallowedDirectories as $dir) {
            $rule = trim($dir->path);
            // Case-insensitive on purpose: the earlier regex-based test used
            // the /i flag, and over-blocking is the safe side for a crawler.
            if ($rule !== '' && strncasecmp($target, $rule, strlen($rule)) === 0) {
                return 0;
            }
        }

        return 1;
    }

    /**
     * Returns the Agent registered under $code, creating it on first use.
     *
     * @param string $code Key identifying the agent (the User-agent line).
     * @return Agent
     */
    public function Agents($code) {
        if (!isset($this->Agents[$code])) {
            $this->Agents[$code] = new Agent($code);
        }

        return $this->Agents[$code];
    }

    /**
     * Opens a URL and returns its content.
     *
     * @param string $url       Address to read.
     * @param int    $max_bytes Upper bound on the number of bytes read.
     * @return string The content, or '' when the URL cannot be opened or read.
     */
    public function Read_Content($url, $max_bytes = 512000) {
        // A missing robots.txt is an ordinary outcome, so the open warning is
        // suppressed and reported to the caller as an empty string instead.
        $handle = @fopen($url, "r");
        if (!$handle) {
            return '';
        }

        // stream_get_contents() keeps reading until the limit or end of
        // stream; a single fread() on a network stream may return only the
        // first chunk and silently drop the remaining rules.
        $contents = stream_get_contents($handle, $max_bytes);
        fclose($handle);

        return $contents === false ? '' : $contents;
    }

    /**
     * Collects the Disallow values of every group addressed to $my_user_agent.
     *
     * @param string $text Raw robots.txt content.
     * @return array List of non-empty Disallow paths, in file order.
     */
    private function disallowedPaths($text) {
        $wanted = $this->fieldValue($this->my_user_agent);
        $token = ($wanted === null) ? trim($this->my_user_agent) : $wanted[1];

        $paths = array();
        $applies = false;  // the current group is addressed to us
        $inRules = false;  // the current group has already listed a rule

        foreach (preg_split('/\r\n|\r|\n/', (string) $text) as $line) {
            $pair = $this->fieldValue($line);
            if ($pair === null) {
                continue;
            }
            $field = $pair[0];
            $value = $pair[1];

            if ($field === 'user-agent') {
                // A User-agent line that follows rules opens a new group;
                // consecutive User-agent lines share the same group.
                if ($inRules) {
                    $applies = false;
                    $inRules = false;
                }
                if (strcasecmp($value, $token) === 0) {
                    $applies = true;
                }
            } elseif ($field === 'disallow' || $field === 'allow') {
                $inRules = true;
                // An empty Disallow value restricts nothing, so it is skipped.
                if ($applies && $field === 'disallow' && $value !== '') {
                    $paths[] = $value;
                }
            }
        }

        return $paths;
    }

    /**
     * Splits one robots.txt line into a lower-cased field name and its value.
     *
     * @param string $line A single line; anything from "#" onwards is dropped.
     * @return array|null array(field, value), or null when the line holds no "field: value" pair.
     */
    private function fieldValue($line) {
        $hash = strpos($line, '#');
        if ($hash !== false) {
            $line = substr($line, 0, $hash);
        }

        if (!preg_match('/^\s*([A-Za-z-]+)\s*:\s*(.*?)\s*$/', $line, $m)) {
            return null;
        }

        return array(strtolower($m[1]), $m[2]);
    }
}
-
-
/**
 * Holds the DisallowedDirectory entries recorded for one user agent.
 */
class Agent {

    /** @var array DisallowedDirectory objects, keyed by the value they were created from. */
    public $DisallowedDirectories = array();

    /**
     * Returns the entry stored under $code, creating it on first use.
     *
     * @param string $code Value handed to the DisallowedDirectory constructor; also the storage key.
     * @return DisallowedDirectory
     */
    public function DisallowedDirectories($code) {
        // isset() avoids the undefined-index warning that a plain read of a
        // not-yet-stored key would raise on the first lookup.
        if (!isset($this->DisallowedDirectories[$code])) {
            $this->DisallowedDirectories[$code] = new DisallowedDirectory($code);
        }

        return $this->DisallowedDirectories[$code];
    }
}
-
-
/**
 * One disallowed location, kept both as given and with its slashes removed.
 */
class DisallowedDirectory {

    /** @var string The value exactly as passed to the constructor. */
    public $path;

    /** @var string The same value with every "/" removed. */
    public $pathstripped;

    /**
     * @param string $apath Location to record.
     */
    public function __construct($apath) {
        $this->path = $apath;
        $this->pathstripped = str_replace("/", "", $apath);
    }
}
-
-
-
/**
 * Tells whether the "User-agent: *" group of a site's robots.txt lets $url
 * be fetched.
 *
 * Only "Disallow" lines are honoured, each as a literal, case-insensitive
 * prefix of the URL's path (plus query string, if any). "Allow" lines and the
 * "*" / "$" pattern characters are not interpreted.
 *
 * @param string      $url        Absolute URL of the page to test.
 * @param string|null $robots_txt robots.txt content already in hand. When
 *                                null it is read from http://<host>/robots.txt.
 * @return int 1 when the URL may be fetched (also when robots.txt cannot be
 *             read), 0 when a Disallow rule covers it.
 */
function robots_allowed($url, $robots_txt = null) {
    $my_user_agent = "*"; //my useragent

    $parts = parse_url($url);
    if (!is_array($parts)) {
        $parts = array();
    }

    if ($robots_txt === null) {
        if (!isset($parts['host'])) {
            // Without a host there is no robots.txt to consult.
            return 1;
        }
        $robotsdomain = "http://" . $parts['host'];
        if (isset($parts['port'])) {
            $robotsdomain .= ':' . $parts['port'];
        }
        // Read_Content() is not defined as a plain function in this file; use
        // it when the surrounding project supplies one, otherwise fall back
        // to the standard library.
        $robots_txt = function_exists('Read_Content')
            ? Read_Content($robotsdomain . '/robots.txt')
            : @file_get_contents($robotsdomain . '/robots.txt');
    }

    $target = (isset($parts['path']) && $parts['path'] !== '') ? $parts['path'] : '/';
    if (isset($parts['query'])) {
        $target .= '?' . $parts['query'];
    }

    $applies = false;   // the current group is addressed to "*"
    $in_rules = false;  // the current group has already listed a rule

    foreach (preg_split('/\r\n|\r|\n/', (string) $robots_txt) as $line) {
        // Drop trailing comments before looking at the line.
        $hash = strpos($line, '#');
        if ($hash !== false) {
            $line = substr($line, 0, $hash);
        }
        if (!preg_match('/^\s*([A-Za-z-]+)\s*:\s*(.*?)\s*$/', $line, $m)) {
            continue;
        }
        $field = strtolower($m[1]);
        $value = $m[2];

        if ($field === 'user-agent') {
            // A User-agent line that follows rules opens a new group;
            // consecutive User-agent lines share the same group.
            if ($in_rules) {
                $applies = false;
                $in_rules = false;
            }
            if ($value === $my_user_agent) {
                $applies = true;
            }
        } elseif ($field === 'disallow' || $field === 'allow') {
            $in_rules = true;
            // An empty Disallow value restricts nothing, so it never matches.
            if ($applies && $field === 'disallow' && $value !== ''
                && strncasecmp($target, $value, strlen($value)) === 0) {
                return 0;
            }
        }
    }

    return 1;
}
Usage:
-
// Page whose crawl permission is being looked up.
$pageUrl = "http://www.juust.org/seo/blackwidow/index.php";

// Robots is constructed from the host name only, so pull it out of the URL.
$hostName = parse_url($pageUrl, PHP_URL_HOST);
$robotsRules = new Robots($hostName);

// Ask the parsed rules about the full URL.
$robotsRules->check($pageUrl);
That should return 0, meaning the URL is disallowed.
The advantage of using a class is that I parse the file only once, so if I am spidering I don't have to parse the dumb text every time to check whether a file is 'accessible'. It's just a rough sketch.
This is the original code:
# Original PHP code by Chirp Internet: www.chirp.com.au
# Please acknowledge use of this code by including this header.
# http://www.the-art-of-web.com/php/parse-robots/
-
/**
 * Decides whether robots.txt permits a crawler to fetch $url.
 *
 * Disallow values are matched as literal, case-sensitive prefixes of the
 * URL's path. Rules from every group addressed to "*" or to $useragent are
 * combined.
 *
 * @param string            $url       Absolute URL of the page to test.
 * @param string|false      $useragent Crawler name to honour in addition to "*";
 *                                     false to honour "*" groups only.
 * @param array|string|null $robotstxt robots.txt content already in hand, as a
 *                                     list of lines or one string. When null it
 *                                     is read from http://<host>/robots.txt.
 * @return bool true when the page may be fetched (also when robots.txt is
 *              missing or empty), false when a Disallow rule covers its path.
 */
function PHProbots_allowed($url, $useragent = false, $robotstxt = null) {

    # parse url to retrieve host and path
    $parsed = parse_url($url);
    if (!is_array($parsed)) {
        $parsed = array();
    }
    # a URL such as "http://host" carries no path component; treat it as the root
    $path = isset($parsed['path']) ? $parsed['path'] : '/';

    # quote with the delimiter so an agent name containing "/" cannot break the pattern
    $agents = array(preg_quote('*', '/'));
    if ($useragent) {
        $agents[] = preg_quote($useragent, '/');
    }
    $agents = implode('|', $agents);

    # location of robots.txt file
    if ($robotstxt === null) {
        if (!isset($parsed['host'])) {
            return true;
        }
        # a missing robots.txt is an ordinary outcome, hence the suppressed warning
        $robotstxt = @file("http://{$parsed['host']}/robots.txt");
    } elseif (is_string($robotstxt)) {
        $robotstxt = preg_split('/\r\n|\r|\n/', $robotstxt);
    }
    if (!$robotstxt) {
        return true;
    }

    $rules = array();
    $ruleapplies = false;
    $previouswasagent = false;

    foreach ($robotstxt as $line) {
        # drop trailing comments so they do not end up inside a rule
        $hash = strpos($line, '#');
        if ($hash !== false) {
            $line = substr($line, 0, $hash);
        }

        # skip blank lines
        $line = trim($line);
        if ($line === '') {
            continue;
        }

        # following rules only apply if User-agent matches $useragent or '*'
        if (preg_match('/^User-agent:(.*)/i', $line, $match)) {
            $matches = (bool) preg_match("/($agents)/i", $match[1]);
            # consecutive User-agent lines name several agents for the same group
            $ruleapplies = $previouswasagent ? ($ruleapplies || $matches) : $matches;
            $previouswasagent = true;
            continue;
        }
        $previouswasagent = false;

        if ($ruleapplies && preg_match('/^Disallow:(.*)/i', $line, $regs)) {
            $rule = trim($regs[1]);

            # an empty rule implies full access - no further tests required
            if ($rule === '') {
                return true;
            }

            # add rules that apply to array for testing
            $rules[] = $rule;
        }
    }

    foreach ($rules as $rule) {
        # check if page is disallowed to us
        if (strncmp($path, $rule, strlen($rule)) === 0) {
            return false;
        }
    }

    # page is not disallowed
    return true;
}






