blogger auto-poster

I needed to get my new link directory’s pages crawled and indexed, and Google needs some stimulation.

So I take a blogger subdomain,
and a 700 category php link directory
and make a table PLD_TAGCLOUD(CAT_ID, POSTED, TAG, FULLPATH)

-- One row per link-directory category. POSTED is 0 until the category has been
-- mailed to the blog, then 1. LEVEL is the depth of the category in the tree.
CREATE TABLE `PLD_TAGCLOUD` (
`ID` BIGINT( 11 ) NOT NULL AUTO_INCREMENT , -- the fill script never supplies ID, so MySQL must generate it
`CAT_ID` DOUBLE NOT NULL ,
`POSTED` DOUBLE NOT NULL DEFAULT 0 , -- new rows start out as "not posted yet"
`LEVEL` DOUBLE NOT NULL ,
`TAG` VARCHAR( 250 ) NOT NULL ,
`FULLPATH` VARCHAR( 250 ) NOT NULL ,
PRIMARY KEY ( `ID` )
) ENGINE = MYISAM

I fill the table with a recursive tree traversal on the category table where “tag” is the ‘title’ field,
and I get the FULLPATH url by using the domain root and the path generated by traversing the tree :


/**
 * Open a MySQL connection and select the working database.
 *
 * The credentials are the (blank) placeholders below. On any failure the
 * MySQL error is echoed and the script stops, so callers always receive a
 * usable link with a database selected.
 *
 * @return resource MySQL link identifier.
 */
function connect() {
	$DB_USER =  "";
	$DB_PASSWORD = "";
	$DB_HOST = "";
	$DB_DATA = "";
	$link = mysql_connect($DB_HOST, $DB_USER, $DB_PASSWORD);
	if (!$link) {
		echo mysql_error();
		exit;
	}
	// Previously a failed select was recorded in $error and ignored, which
	// only surfaced later as a confusing "no database selected" query error.
	if (!mysql_select_db($DB_DATA, $link)) {
		echo mysql_error($link);
		exit;
	}
	return $link;
}

// Rebuild the tag cloud from scratch: empty the table, then refill it by
// walking the category tree from the root (parent id 0).
$link = connect();
$del="DELETE FROM `PLD_TAGCLOUD`";
$mydel = mysql_query($del, $link) or die(mysql_error());
@mysql_close($link);

$root='http://links.trismegistos.net';
// read() has no return value, so this appends an empty string; initialise
// $content first so the append does not raise an undefined-variable notice.
if (!isset($content)) {
	$content = '';
}
$content .= read(0, $root, 1);

/**
 * Recursively walk the category tree and write one PLD_TAGCLOUD row per category.
 *
 * For every child of $rootid a row is inserted with the category id, the
 * depth, the HTML-entity-encoded title and the full URL (with a trailing
 * slash); the function then descends into that child.
 *
 * @param mixed  $rootid    ID of the parent category whose children are processed.
 * @param string $pathid    URL built so far (the site root on the first call).
 * @param int    $thislevel Depth written to the LEVEL column for these children.
 * @return void
 */
function read($rootid, $pathid, $thislevel) {
	$link = connect();
	$myqry = "SELECT * FROM `PLD_CATEGORY` WHERE `PARENT_ID`='".mysql_real_escape_string($rootid, $link)."'";
	$myres = mysql_query($myqry, $link) or die(mysql_error());
	if(mysql_num_rows($myres)<1) {
		// Leaf category: release the link before returning instead of leaking it
		@mysql_close($link);
		return;
	}
	while($row=mysql_fetch_assoc($myres)) {
		$thispath= $pathid ."/".$row['TITLE_URL'];
		// Escape every value: titles and URL slugs may contain quotes or backslashes.
		// The insert reuses $link; a separate connect() per row is not needed.
		$add="INSERT INTO `PLD_TAGCLOUD` (`CAT_ID`, `LEVEL`, `TAG`, `FULLPATH`) VALUES ('"
			.mysql_real_escape_string($row['ID'], $link)."', '"
			.(int)$thislevel."', '"
			.mysql_real_escape_string(htmlentities($row['TITLE'], ENT_QUOTES), $link)."' ,'"
			.mysql_real_escape_string($thispath."/", $link)."')";
		$addit = mysql_query($add, $link) or die(mysql_error());
		read($row['ID'], $thispath, $thislevel+1);
	}
	@mysql_close($link);
}

Note: I also use a LEVEL field to store the depth of a category page (0 for the root, 1 for main categories; mine goes down to 4).

Then we make a simple routine to grab the first record for posted=0,
grab the url
grab the title
grab 3 posts off of google-blogsearch on the TITLE, add em to an email,
add a link to the category page url,
mail(email, subject, message-body, headers)

And of course the coup de grâce, the cron job: 700 posts at 4 per hour, so in about 175 hours my entire site is listed on a nice juicy blog. Just for the hell of it I put the blogsearch links on ‘follow’ so my poor victims get a link as well.


/**
 * Open a MySQL connection and select the working database.
 *
 * The credentials are the (blank) placeholders below. On any failure the
 * MySQL error is echoed and the script stops, so callers always receive a
 * usable link with a database selected.
 *
 * @return resource MySQL link identifier.
 */
function connect() {
	$DB_USER =  "";
	$DB_PASSWORD = "";
	$DB_HOST = "";
	$DB_DATA = "";
	$link = mysql_connect($DB_HOST, $DB_USER, $DB_PASSWORD);
	if (!$link) {
		echo mysql_error();
		exit;
	}
	// Previously a failed select was recorded in $error and ignored, which
	// only surfaced later as a confusing "no database selected" query error.
	if (!mysql_select_db($DB_DATA, $link)) {
		echo mysql_error($link);
		exit;
	}
	return $link;
}

	// Pick the next category that has not been posted yet (highest ID first),
	// remember its URL and URL-encoded tag, and flag it as posted.
	$link = connect();
	// Only one row is used per run, so ask the database for just that row
	$myqry = "SELECT * FROM `PLD_TAGCLOUD` WHERE `POSTED`='0' ORDER BY ID DESC LIMIT 1";
	$myres = mysql_query($myqry, $link) or die(mysql_error());
	$row = mysql_fetch_assoc($myres);
	if(!$row) {
		// Everything has been posted already: nothing to do this run
		@mysql_close($link);
		return;
	}
	$myurl = $row['FULLPATH'];
	$mykey = urlencode($row['TAG']);
	// The row is marked as posted here, before the mail is actually sent
	$add="UPDATE `PLD_TAGCLOUD` SET `POSTED`='1' WHERE `ID`='".mysql_real_escape_string($row['ID'], $link)."'";
	$addit = mysql_query($add, $link) or die(mysql_error());
	@mysql_close($link);


// Google Blog Search RSS feed (3 results) for the search key held in $mykey
$xmlSource="http://blogsearch.google.com/blogsearch_feeds?hl=en&c2coff=1&lr=&safe=active&as_drrb=q&as_qdr=d&q=".$mykey."&ie=utf-8&num=3&output=rss";

// Scratch fields for the item currently being parsed, plus the name of the open element
$title=$link=$description=$author=$pubDate=$currentElement="";

// Finished feed items, one associative array per item
$nieuwsitems = array();

/**
 * XML start-tag handler.
 *
 * Clears the per-item scratch fields when a new "item" opens, records the
 * name of the element now open in $GLOBALS['currentElement'], and for a
 * "link" element stores its href attribute in $GLOBALS['href'].
 *
 * @param mixed  $parser XML parser handle (unused).
 * @param string $name   Name of the element being opened.
 * @param array  $attr   Attributes of the element.
 * @return void
 */
function startElement($parser,$name,$attr){
	if(strcmp($name,"item")==0){
		foreach(array('title','link','description','author','pubDate') as $field){
			$GLOBALS[$field]="";
		}
	}
	$GLOBALS['currentElement']=$name;
	if(strcmp($name,"link")==0){
		// A link element may carry its URL as text and have no href attribute at all;
		// reading the missing key directly raises an undefined-index notice.
		$GLOBALS['href']=isset($attr["href"]) ? $attr["href"] : null;
	}

}

/**
 * XML end-tag handler.
 *
 * When an "item" closes, its collected fields are appended to
 * $GLOBALS['nieuwsitems'] and the scratch fields are cleared for the next item.
 *
 * @param mixed  $parser XML parser handle (unused).
 * @param string $name   Name of the element being closed.
 * @return void
 */
function endElement($parser,$name){
	$elements=array('title','link','description','author','pubDate');
	if(strcmp($name,"item")==0){
		$temp=array();
		foreach($elements as $element){
			$temp[$element] = $GLOBALS[$element];
		}
		$GLOBALS['nieuwsitems'][]=$temp;
		foreach($elements as $element){
			$GLOBALS[$element]="";
		}
	}
	// No element is open any more. Without this, the whitespace between a closing
	// tag and the next opening tag was appended to the field that just closed.
	$GLOBALS['currentElement']="";
}

/**
 * XML character-data handler: appends the text to the scratch field named by
 * the currently open element, when that element is one of the tracked fields.
 *
 * @param mixed  $parser XML parser handle (unused).
 * @param string $data   Text chunk delivered by the parser.
 * @return void
 */
function characterData($parser, $data) {
	$tracked = array ('title', 'link', 'description','author','pubDate');
	$open = $GLOBALS["currentElement"];
	if (!in_array($open, $tracked)) {
		return;
	}
	$GLOBALS[$open] .= $data;
}

/**
 * Read the feed at the global $xmlSource through the SAX handlers and return
 * the items they collected.
 *
 * Dies with a message when the source cannot be opened or the XML is malformed.
 *
 * @return array The global $nieuwsitems list as filled by endElement().
 */
function parseFile(){
	global $xmlSource,$nieuwsitems;
	$xml_parser=xml_parser_create();
	xml_set_element_handler($xml_parser,"startElement","endElement");
	xml_set_character_data_handler($xml_parser,"characterData");
	// Keep element names as written ("item", "pubDate") instead of upper-casing them
	xml_parser_set_option($xml_parser,XML_OPTION_CASE_FOLDING,false);
	if(!($fp=fopen($xmlSource,"r"))){
		die("Cannot open  $xmlSource  ");
	}
	// Compare strictly: a chunk consisting of just "0" is falsy and would end the loop early
	while(($data=fread($fp,4096))!==false && $data!==''){
		if(!xml_parse($xml_parser,$data,feof($fp))){
			fclose($fp);
			die(sprintf("XML error at line %d column %d ", 
			xml_get_current_line_number($xml_parser), 
			xml_get_current_column_number($xml_parser)));
		}
	}
	// The stream handle used to be left open
	fclose($fp);
	xml_parser_free($xml_parser);
	return $nieuwsitems;
}

// Build the post body from the parsed feed items, append a link back to the
// category page, then mail it to the Blogger mail-to-post address and echo it.
$result = parseFile();

// Start from an empty string: appending to an undefined variable raises a notice
$strResult = '';
foreach($result as $arr){
	$strResult .= '< hr />';
	$strResult .= '< h4>'.$arr["title"].'< /h4>'.$arr["description"].'< br />"'.$arr["title"].' ('.parse_url($arr["link"], PHP_URL_HOST).')< br />< br />';	
}

$strResult .= '< br /> < a href="'.$myurl.'" title="'.$mykey.'">trismegistos links : '.$mykey.'< /a>< br />'; 

$email='juustout.linkdirectory@blogger.com';
$subject = $mykey;
// The Content-type header makes the mail body arrive as HTML instead of plain text
mail($email,$subject,$strResult, "MIME-Version: 1.0\n"."Content-type: text/html; charset=iso-8859-1");

echo $strResult;

(Note: the HTML markup in the last lines is written as < br />; if you cut and paste it, remove the space or you get a mess. Also note the extra header in the PHP mail function: it makes it possible to post HTML-marked-up text — otherwise you get flat text posted and your site looks like ****.)

tool : pagerank per url from a sitemap

I wired a google pagerank toolbar-query snippet to a simplexml sitemap readout, and put it on a page. You can fill in a sitemap url and get the google pageranks of all ‘mapped’ urls.

It works, I stripped it down and you can download it here or on the sample page.

I mainly wanted the snippet wired to a sitemap to compare the results of my pagerank spider tool with an actual google readout. Running a sitemap through a toolbar query snippet is the fastest way.

I already had a spider result for siteometrics (calc pr), so now I can compare it to Google’s toolbar query on http://www.siteometrics.com/sitemap.xml :

google pr calc pr URL
2 http://www.siteometrics.com/
2 0.80 /index.php
0 0.32 /advertise.html
0.77 /recommend.php
0.75 /search-engine-saturation.php
0 0.75 /link-popularity.php
0 0.75 /pagerank.php
0 0.75 /bulk-pagerank.php
0 0.75 /pagerank-mult-pages.php
0 0.75 /link-pop-pagerank.php
0.75 /link-search-pagerank.php
0 0.75 /alexa.php
0 0.75 /bulk-alexa.php
0 0.75 /serpcheck.php
0 0.75 /keyword-research.php
0 0.67 /visitor-info.php
0.24 /useful-links.html
0 0.24 /contact-us.html
0.24 /sitemap.html
0.24 /privacy-policy.html

Weird result, the sitemap they issue is part old site, part new site. If you check the pageranks on the newer .php files it’s the same, though.

a quarter of the urls link into the archived site, that might cause the drop in pagerank (links to /feed and google.com on every page, see the other article on siteometrics).


for the freaks : here’s the php code (assume url is a valid sitemap-url).


// Print "<pagerank>;<url>;" for every <loc> entry of the sitemap given in the
// 'url' request parameter.
// NOTE(review): 'url' comes straight from the request and is handed to
// simplexml_load_file(), which will fetch whatever location it names;
// restrict it before exposing this page publicly.
$myurl=$_REQUEST['url'];
$xml = simplexml_load_file($myurl);
if ($xml === false) {
	// Not reachable or not XML: stop instead of iterating over "false"
	exit('Could not load sitemap');
}
foreach($xml->url as $u) echo pagerank((string) $u->loc)."\n";
exit;

/**
 * Look up the toolbar PageRank of a URL.
 *
 * @param string $url URL to look up.
 * @return string "<rank>;<url>;" where rank is '-' when no rank was obtained.
 */
function pagerank($url) {
	if (!preg_match('/^(http:\/\/)?([^\/]+)/i', $url)) {
		$url='http://'.$url;
	}
	$pr=curl_getpr($url);
	return $pr.';'.$url.';';
}

/**
 * Build the "ch" checksum parameter for the toolbar query.
 *
 * @param string $url URL to hash.
 * @return string Checksum string.
 */
function getch($url) {
	return CheckHash(HashURL($url));
}

/**
 * Query toolbarqueries.google.com for the rank of a URL.
 *
 * @param string $url URL to look up.
 * @return string Text following "Rank_" in the response, or '-' when the
 *                request fails or the response holds no rank.
 */
function curl_getpr($url) {
	$googleua='Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.0.6) Gecko/20060728 Firefox/1.5';
	$ch = getch($url);
	$form="http://toolbarqueries.google.com/search?client=navclient-auto&ch=$ch&features=Rank&q=info:$url";
	$cr = curl_init($form);
	curl_setopt($cr, CURLOPT_FAILONERROR, true);
	curl_setopt($cr, CURLOPT_HEADER, 0);
	curl_setopt($cr, CURLOPT_USERAGENT, $googleua); // Spoof the user-agent
	curl_setopt($cr, CURLOPT_RETURNTRANSFER, true);
	$data = curl_exec($cr);
	// The handle is no longer needed whatever the outcome
	curl_close($cr);
	unset($cr);
	if(!$data) {
		return '-';
	}
	$pos = strpos($data, "Rank_");
	if($pos === false) {
		return '-';
	}
	// Skip 9 characters from the start of "Rank_" to reach the value
	$pr=substr($data, $pos + 9);
	$pr=trim($pr);
	$pr=str_replace("\n",'',$pr);
	return $pr;
}

//PageRank Lookup v1.1 by HM2K (update: 31/01/07)
//based on an algorithm found at: http://pagerank.gamesaga.net/
//live demo: http://www.highrankforum.com/pagerank.php

//convert a string to a 32-bit integer
function StrToNum($Str, $Check, $Magic) {
	$Int32Unit = 4294967296; // 2^32
	$length = strlen($Str);
	for ($i = 0; $i < $length; $i++) {
		$Check *= $Magic;
		if ($Check >= $Int32Unit) {
			$Check = ($Check - $Int32Unit * (int) ($Check / $Int32Unit));
			//if the check less than -2^31
			$Check = ($Check < -2147483648) ? ($Check + $Int32Unit) : $Check;
		}
		// Square-bracket offset: the {$i} form was removed in PHP 8
		$Check += ord($Str[$i]);
	}
	return $Check;
}

//genearate a hash for a url
function HashURL($String) {
	$Check1 = StrToNum($String, 0x1505, 0x21);
	$Check2 = StrToNum($String, 0, 0x1003F);
	$Check1 >>= 2;
	$Check1 = (($Check1 >> 4) & 0x3FFFFC0 ) | ($Check1 & 0x3F);
	$Check1 = (($Check1 >> 4) & 0x3FFC00 ) | ($Check1 & 0x3FF);
	$Check1 = (($Check1 >> 4) & 0x3C000 ) | ($Check1 & 0x3FFF);
	$T1 = (((($Check1 & 0x3C0) << 4) | ($Check1 & 0x3C)) <<2 ) | ($Check2 & 0xF0F );
	$T2 = (((($Check1 & 0xFFFFC000) << 4) | ($Check1 & 0x3C00)) << 0xA) | ($Check2 & 0xF0F0000 );
	return ($T1 | $T2);
}

//genearate a checksum for the hash string
function CheckHash($Hashnum) {
	$CheckByte = 0;
	$Flag = 0;
	$HashStr = sprintf('%u', $Hashnum) ;
	$length = strlen($HashStr);
	for ($i = $length - 1; $i >= 0; $i --) {
		$Re = $HashStr[$i];
		if (1 === ($Flag % 2)) {
			$Re += $Re;
			$Re = (int)($Re / 10) + ($Re % 10);
		}
		$CheckByte += $Re;
		$Flag ++;
	}
	$CheckByte %= 10;
	if (0 !== $CheckByte) {
		$CheckByte = 10 - $CheckByte;
		if (1 === ($Flag % 2) ) {
			if (1 === ($CheckByte % 2)) {
				$CheckByte += 9;
			}
			$CheckByte >>= 1;
		}
	}
	return '7'.$CheckByte.$HashStr;
}

scrape the ape three : sidebar zoo

“Scrape the Ape” plugin.

i ripped the widget code framework from Marcel Proulx http://www.district30.net and changed it a bit so it runs my Ape-Scrape. I’ll break my brains on the specifics of widget-codes later.

/*
Plugin Name: Scrape the Ape
Plugin URI: https://juust.org/
Description: How to scrape a zoo from Flickr onto the sidebar.
Version: 0.1
Author: Lord of Apes
Author URI: https://juust.org/
*/

/**
 * Widget display callback: echoes a grid string built from Flickr photos
 * found for the tags configured on this widget instance.
 *
 * Both $args and $widget_args are run through extract(), so their keys become
 * local variables. The settings of the instance are read from the
 * 'widget_apes' option, indexed by the instance number.
 *
 * NOTE(review): from the line that starts building $strgrid onwards, the rest
 * of this listing (the grid loop, apes_control() and apes_register()) has lost
 * its line breaks and its HTML string contents; it is kept exactly as found.
 */
function disp_apes( $args, $widget_args = 1 ) {
	extract( $args, EXTR_SKIP );
	// A bare number is shorthand for array( 'number' => N )
	if ( is_numeric($widget_args) )
		$widget_args = array( 'number' => $widget_args );
	$widget_args = wp_parse_args( $widget_args, array( 'number' => -1 ) );
	extract( $widget_args, EXTR_SKIP );

	// Data should be stored as array:  array( number => data for that instance of the widget, ... )
	$options = get_option('widget_apes');
	// Nothing stored for this instance: output nothing
	if ( !isset($options[$number]) )
		return;

	// Turns the stored settings into locals; apes_control() saves the keys
	// apes_tags, apes_cols and apes_rows
	extract($options[$number], EXTR_SKIP);

//this is the scraper bit :

	// Fetch the public-photos RSS feed for the configured tags as one string
	$flikker = join("",file("http://api.flickr.com/services/feeds/photos_public.gne?tags=".$apes_tags."&format=rss"));
	// Cut the feed at every occurrence of the img-src marker
	$flikkerhits = preg_split('/img src="\;/', $flikker, -1, PREG_SPLIT_OFFSET_CAPTURE);
	foreach($flikkerhits as $flikkerhit){
	    $i++;
	    // Skip the chunk before the first marker; from each later chunk keep the
	    // text up to 7 characters before the first 'width'
	    if($i>1) $apes[]=substr($flikkerhit[0], 0, strpos($flikkerhit[0], 'width')-7);
	}
	$strgrid = "";
	$currentcol=1;
	$currentrow=1;
	$columns=$apes_cols;
	$rows=$apes_rows;
	$strgrid .= "
"; for($a=0;$a"; $strgrid .= ""; $strgrid .= ""; $currentcol++; if($currentcol>$columns) { $currentcol=1; $strgrid .= " "; $currentrow++; if($currentrow>$rows) { break; } else { $strgrid .= " "; } } } $strgrid .= "
"; echo $strgrid; } function apes_control($widget_args) { global $wp_registered_widgets; static $updated = false; // have we already updated the data after a POST submit if ( is_numeric($widget_args) ) $widget_args = array( 'number' => $widget_args ); $widget_args = wp_parse_args( $widget_args, array( 'number' => -1 ) ); extract( $widget_args, EXTR_SKIP ); // Data should be stored as array: array( number => data for that instance of the widget, ... ) $options = get_option('widget_apes'); if ( !is_array($options) ) $options = array(); // We need to update the data if ( !$updated && !empty($_POST['sidebar']) ) { // Tells us what sidebar to put the data in $sidebar = (string) $_POST['sidebar']; $sidebars_widgets = wp_get_sidebars_widgets(); if ( isset($sidebars_widgets[$sidebar]) ) $this_sidebar =& $sidebars_widgets[$sidebar]; else $this_sidebar = array(); foreach ( $this_sidebar as $_widget_id ) { // Remove all widgets of this type from the sidebar. We'll add the new data in a second. This makes sure we don't get any duplicate data // since widget ids aren't necessarily persistent across multiple updates if ( 'disp_apes' == $wp_registered_widgets[$_widget_id]['callback'] && isset($wp_registered_widgets[$_widget_id]['params'][0]['number']) ) { $widget_number = $wp_registered_widgets[$_widget_id]['params'][0]['number']; if ( !in_array( "apes-$widget_number", $_POST['widget-id'] ) ) // the widget has been removed. 
"many-$widget_number" is "{id_base}-{widget_number} unset($options[$widget_number]); } } foreach ( (array) $_POST['apes'] as $widget_number => $widget_apes_instance ) { // compile data from $widget_random_image_instance $apes_tags=strip_tags(stripslashes( $widget_apes_instance['apes_tags'])); $apes_rows=strip_tags(stripslashes( $widget_apes_instance['apes_rows'])); $apes_cols=strip_tags(stripslashes( $widget_apes_instance['apes_cols'])); $options[$widget_number] = compact('apes_tags', 'apes_cols', 'apes_rows'); } update_option('widget_apes', $options); $updated = true; // So that we don't go through this more than once } // Here we echo out the form if ( -1 == $number ) { // We echo out a template for a form which can be converted to a specific form later via JS $apes_tags = 'apes'; $apes_cols = 3; $apes_rows = 2; $number = '%i%'; } else { $apes_tags = attribute_escape($options[$number]['apes_tags']); $apes_cols=$options[$number]['apes_cols']; $apes_rows=attribute_escape($options[$number]['apes_rows']); } // The form has inputs with names like widget-many[$number][something] so that all data for that instance of // the widget are stored in one $_POST variable: $_POST['widget-many'][$number] ?> < ?php } /* Function: apes_register ** ** Registers the apes widgets with the widget page ** ** args: none ** returns: nothing */ function apes_register() { if ( !$options = get_option('widget_apes') ) $options = array(); $widget_ops = array('classname' => 'widget_apes', 'description' => __('Displays Flickr Apes')); $control_ops = array('apes_cols' => 3, 'apes_rows' => 2, 'apes_tags' => 'apes', 'id_base' => 'apes'); $name = __('Scrape the Apes'); $registered = false; foreach ( array_keys($options) as $o ) { // Old widgets can have null values for some reason if ( !isset($options[$o]['apes_tags']) ) continue; // $id should look like {$id_base}-{$o} $id = "apes-$o"; // Never never never translate an id $registered = true; wp_register_sidebar_widget( $id, $name, 'disp_apes', 
$widget_ops, array( 'number' => $o ) ); wp_register_widget_control( $id, $name, 'apes_control', $control_ops, array( 'number' => $o ) ); } // If there are none, we register the widget's existance with a generic template if ( !$registered ) { wp_register_sidebar_widget( 'apes-1', $name, 'disp_apes', $widget_ops, array( 'number' => -1 ) ); wp_register_widget_control( 'apes-1', $name, 'apes_control', $control_ops, array( 'number' => -1 ) ); } } // This is important add_action( 'widgets_init', 'apes_register' )

do not expect to see this on wordpress.org ;) when i figure out how the widget code works i’ll make something proper.