php content class
Just some stuff for a scraper.
Basic content class
input : table [content, url, keywords]
output : same as input + [wordcount, new content]
-
// Usage example: $row is one record of the input table (content, url, keywords).
$art = new Article(html_entity_decode($row['content']), $row['url']);

// Only process articles that are long enough to be worth publishing.
if ($art->words > 200) {
    $art->FoulFilter();               // mask words listed in the foul-word file
    $art->tags_nofollow();            // put every link in the content on nofollow
    $art->cut_text(0.80);             // keep the first 80% of the content
    $art->tribute($row['keywords']);  // append a link back to the original article
}
That outputs a nice, clean article cut off at N%, with all links set to nofollow, one follow link back to the author's page, and all foul language replaced with *****. Nice and simple.
-
/**
 * A scraped article plus the clean-up steps applied before republishing it.
 *
 * The constructor stores the source HTML and URL, computes rough statistics
 * and derives a title. The remaining public methods each rewrite
 * $this->content in place and can be chained in any order by the caller.
 */
class Article
{
    /** Unmodified source HTML as passed to the constructor. */
    public $original;

    /** URL the article was scraped from. */
    public $url;

    /** Title derived by get_title(). */
    public $title;

    /** Working copy of the HTML; every filter method rewrites this. */
    public $content;

    public $keywords;

    /** Length of the content in bytes, set by get_stats(). */
    public $length;

    /** Number of whitespace-separated tokens in the content, set by get_stats(). */
    public $words;

    /** Number of occurrences of "http" in the content, set by get_stats(). */
    public $anchors;

    public $tags = array();

    public $trackbacks = array();

    /** Fraction of the content cut_text() keeps when called with a falsy argument. */
    public $perc = .85;

    public $publishdate = '';

    public $publishperdate = 4;

    /** Path of the whitespace-separated word list read by FoulFilter(). */
    public $foul = 'foulword.txt';

    public $newcontent;

    /**
     * @param string $source Article HTML.
     * @param string $url    URL the article came from.
     */
    public function __construct($source, $url)
    {
        $this->url = $url;
        $this->original = $source;
        $this->content = $source;
        $this->get_stats();
        $this->get_title();
    }

    /**
     * Sets $this->title.
     *
     * For "?p=" style URLs (no readable slug) the title is the text of the
     * first non-empty h1, h2 or h3 in the content, falling back to the first
     * 20 bytes of the content. For any other URL the last path segment is
     * used, with dashes turned into spaces, followed by " - <host>".
     */
    public function get_title()
    {
        if (strpos($this->url, '?p=') !== false) {
            $title = '';
            foreach (array('h1', 'h2', 'h3') as $tag) {
                // '#' delimiter so the closing tag's slash needs no escaping.
                $pattern = '#<' . $tag . '\b[^>]*>(.*?)</' . $tag . '\s*>#is';
                if (preg_match($pattern, $this->content, $m) === 1) {
                    $candidate = trim(strip_tags($m[1]));
                    if ($candidate !== '') {
                        $title = $candidate;
                        break;
                    }
                }
            }
            if ($title === '') {
                $title = (string) substr($this->content, 0, 20);
            }
            $this->title = $title;
            return;
        }

        // Ignore trailing slashes so ".../my-post/" and ".../my-post" both
        // yield the segment "my-post".
        $trimmed = rtrim($this->url, '/');
        $slashAt = strrpos($trimmed, '/');
        $slug = ($slashAt === false) ? $trimmed : (string) substr($trimmed, $slashAt + 1);
        $host = (string) parse_url($this->url, PHP_URL_HOST);

        $this->title = str_replace('-', ' ', $slug) . ' - ' . $host;
    }

    /**
     * Computes rough statistics for the current content: $this->words
     * (whitespace-separated tokens, markup included), $this->length (bytes)
     * and $this->anchors (occurrences of "http").
     */
    public function get_stats()
    {
        // PREG_SPLIT_NO_EMPTY makes empty content count as 0 words and stops
        // runs of spaces or newlines from inflating the count.
        $tokens = preg_split('/\s+/', (string) $this->content, -1, PREG_SPLIT_NO_EMPTY);
        $this->words = is_array($tokens) ? count($tokens) : 0;

        $this->length = strlen((string) $this->content);

        // preg_match_all() returns the number of full matches; counting the
        // $matches array itself would always give 1.
        $this->anchors = (int) preg_match_all('/http/', (string) $this->content, $matches);
    }

    /**
     * Replaces every word listed in the file $this->foul with "*****".
     *
     * The file is read as a whitespace-separated list. Matching is
     * case-insensitive and only hits whole words, so punctuation next to a
     * word does not hide it. If the file cannot be read or is empty the
     * content is left unchanged.
     */
    public function FoulFilter()
    {
        if (!is_readable($this->foul)) {
            return;
        }
        $raw = file_get_contents($this->foul);
        if ($raw === false) {
            return;
        }

        $words = preg_split('/\s+/', $raw, -1, PREG_SPLIT_NO_EMPTY);
        if (!$words) {
            return;
        }

        // Quote each word so regex metacharacters in the list are literal.
        $quoted = array();
        foreach (array_unique($words) as $word) {
            $quoted[] = preg_quote($word, '/');
        }
        $alternatives = implode('|', $quoted);

        $filtered = preg_replace(
            '/(?<![\p{L}\p{N}_])(?:' . $alternatives . ')(?![\p{L}\p{N}_])/iu',
            '*****',
            $this->content
        );
        if ($filtered === null) {
            // The /u modifier fails on invalid UTF-8; retry byte-wise.
            $filtered = preg_replace('/\b(?:' . $alternatives . ')\b/i', '*****', $this->content);
        }
        if ($filtered !== null) {
            $this->content = $filtered;
        }
    }

    /**
     * Adds target="_blank" rel="nofollow" in front of every " href="
     * attribute in the content.
     */
    public function tags_nofollow()
    {
        $this->content = str_replace(
            ' href=',
            ' target="_blank" rel="nofollow" href=',
            $this->content
        );
    }

    /**
     * Replaces the content with the original HTML up to and including its
     * first closing </div> tag. If the original has no closing div the
     * content is left unchanged.
     */
    public function get_div()
    {
        $closing = '</div>';
        // Search the same string that is being cut, so the offset is valid.
        $at = stripos($this->original, $closing);
        if ($at === false) {
            return;
        }
        $this->content = substr($this->original, 0, $at + strlen($closing));
    }

    /**
     * Appends a "more..." link pointing at the article's source URL.
     *
     * @param string $k Keywords placed in the link's title attribute.
     */
    public function tribute($k)
    {
        // URL and keywords come from scraped/stored data, so escape them
        // before placing them inside HTML attributes.
        $href = htmlspecialchars((string) $this->url, ENT_QUOTES, 'UTF-8');
        $title = htmlspecialchars((string) $k, ENT_QUOTES, 'UTF-8');

        $this->content .= '<br /><br />[<a href="' . $href . '" title="' . $title
            . ', original wordpress article">more...</a>]<br />';
    }

    /**
     * Truncates the content to the given fraction of its length.
     *
     * The length is measured in UTF-8 characters so a multibyte character
     * is never split, and a tag left unterminated by the cut is dropped.
     *
     * @param float $perc Fraction to keep, clamped to 0..1; a falsy value
     *                    selects $this->perc.
     */
    public function cut_text($perc = 0.85)
    {
        if (!$perc) {
            $perc = $this->perc;
        }
        $perc = max(0.0, min(1.0, (float) $perc));

        $keep = (int) round(mb_strlen($this->content, 'UTF-8') * $perc);
        $cut = (string) mb_substr($this->content, 0, $keep, 'UTF-8');

        // A '<' with no '>' after it means the cut landed inside a tag.
        $lastOpen = strrpos($cut, '<');
        $lastClose = strrpos($cut, '>');
        if ($lastOpen !== false && ($lastClose === false || $lastClose < $lastOpen)) {
            $cut = (string) substr($cut, 0, $lastOpen);
        }

        $this->content = $cut;
    }
}
-
-
The actual posting should be done from a ‘blog’ class,
which should offer:
- xmlrpc http posting
- ixr-wordpress
- table insert
- email blogging
etcetera






