I got a site banned at Google, so I got pissed and took a script from the blackbox @ Digerati Marketing that scrapes proxy addresses, and wired a database and curl into it. Now it scrapes proxies, randomly picks one, prunes dead proxies and returns data.
It is basic and only uses anonymous (level 2) proxies, but it works.
-
-
/* (mysql table)

CREATE TABLE IF NOT EXISTS `serp_proxies` (
`id` int(11) NOT NULL auto_increment,
`ip` text NOT NULL,
`port` text NOT NULL,
PRIMARY KEY (`id`)
) ENGINE=MyISAM DEFAULT CHARSET=latin1 AUTO_INCREMENT=1 ;

*/
-
-
//initialize database class, replace with own code
//(MyProxies reads the database handle from the global $serpdb, so init.php
// is presumably where that global gets created -- confirm against your setup)
include('init.php');

//main class
$p = new MyProxies;

//do I have proxies in the database ?
//if not, scrape one listing page and store whatever was found
if ($p->GetCount() < 1) {
    $p->GetSomeAir(1);
    $p->store2database();
}

//pick one
$p->RandomProxy();

if (!is_object($p->ThisProxy)) {
    //the table is still empty (scrape failed or returned nothing usable):
    //bail out with a warning instead of calling a method on null
    trigger_error('No proxy available in serp_proxies', E_USER_WARNING);
} else {
    //get the page
    $p->ThisProxy->DoRequest('http://www.domain.com/robots.txt');

    //error handling (cURL error numbers)
    //7  no connect
    //28 timed out
    //52 empty reply
    //if it is dead or doesn't allow connections : prune it
    //a timeout (28) may be temporary, so that proxy is kept
    $error = $p->ThisProxy->ProxyError;
    if ($error == 7 || $error == 52) {
        $p->DeleteProxy($p->ThisProxy->proxy_ip);
    }

    //you could loop back until you get a 0-error proxy, but that ain't the point

    //give me the content
    echo $p->ThisProxy->Content;
}
-
-
-
/**
 * Maintains a pool of scraped HTTP proxies in the `serp_proxies` table.
 *
 * All database access goes through the global $serpdb object. Its query()
 * method is assumed to return a result resource of the original MySQL
 * extension, because the results are passed to mysql_* functions -- confirm
 * against the class set up in init.php.
 */
class MyProxies
{
    /** List of array(ip, port) pairs collected by GetSomeAir(), waiting to be stored. */
    public $Proxies = array();

    /** Proxy instance chosen by the last RandomProxy() call; null when none was found. */
    public $ThisProxy;

    /** Row count cached by the last GetCount() call. */
    public $MyCount;

    /**
     * Picks a random proxy from the database and exposes it as $this->ThisProxy.
     *
     * @return bool true when a proxy was selected, false when the table is
     *              empty or a query failed ($this->ThisProxy is null then).
     */
    public function RandomProxy()
    {
        global $serpdb;

        // Forget the previous pick so a failed lookup cannot hand back a stale proxy.
        $this->ThisProxy = null;

        // Let MySQL compute a random row offset in [0, COUNT(*)).
        $offset_result = $serpdb->query("SELECT FLOOR(RAND() * COUNT(*)) AS `offset` FROM `serp_proxies`");
        $offset_row = $offset_result ? mysql_fetch_object($offset_result) : false;
        if (!$offset_row) {
            return false;
        }

        // Cast so that only an integer can ever reach the LIMIT clause.
        $offset = (int) $offset_row->offset;

        $result = $serpdb->query("SELECT * FROM `serp_proxies` LIMIT $offset, 1");
        $row = $result ? mysql_fetch_assoc($result) : false;
        if (!$row) {
            // Empty table: nothing to pick.
            return false;
        }

        // Make an instance of Proxy with "ip:port" as proxy host.
        $this->ThisProxy = new Proxy($row['ip'] . ':' . $row['port']);
        $this->ThisProxy->proxy_ip = $row['ip'];
        $this->ThisProxy->proxy_port = $row['port'];

        return true;
    }

    /**
     * Scrapes proxy listing pages proxy-01.htm .. proxy-NN.htm from samair.ru
     * and appends every ip/port pair found to $this->Proxies.
     *
     * Pages that cannot be downloaded are skipped (best effort).
     *
     * @param int $pages number of listing pages to fetch
     */
    public function GetSomeAir($pages)
    {
        for ($index = 0; $index < $pages; $index++) {
            $pageno = sprintf("%02d", $index + 1);
            $page_url = "http://www.samair.ru/proxy/proxy-" . $pageno . ".htm";

            // @ keeps an unreachable page from emitting a warning; the failure is handled below.
            $page_html = @file_get_contents($page_url);
            if ($page_html === false) {
                continue;
            }

            foreach ($this->ParseProxies($page_html) as $pair) {
                $this->Proxies[] = $pair;
            }
        }
    }

    /**
     * Extracts ip/port pairs from the HTML of a listing page.
     *
     * Expects table rows on a single line whose first cell holds "ip:port".
     * Rows whose first cell does not split into exactly two non-empty parts
     * are ignored.
     *
     * @param string $html raw page markup
     * @return array list of array(ip, port) string pairs, possibly empty
     */
    public function ParseProxies($html)
    {
        $found = array();

        if (!is_string($html) || $html === '') {
            return $found;
        }

        // Get rid of the page chrome: keep what lies between the first
        // "<tr><td>" and the last "</td></tr>" on that line.
        if (!preg_match('/<tr><td>(.*)<\/td><\/tr>/', $html, $matches)) {
            return $found;
        }

        // Rows may or may not carry a closing </tr>; accept both forms.
        $rows = preg_split('#</td>(?:</tr>)?<tr><td>#', $matches[1]);

        foreach ($rows as $row) {
            $cells = explode('</td><td>', $row);
            $parts = explode(':', trim($cells[0]));
            if (count($parts) === 2 && $parts[0] !== '' && $parts[1] !== '') {
                $found[] = $parts;
            }
        }

        return $found;
    }

    /**
     * Stores the scraped proxies ($this->Proxies) in the database.
     *
     * The values come from a third-party web page, so every pair is validated
     * (IPv4 address, numeric port 1-65535) before it is placed in a query.
     * Addresses that are already stored are not inserted again.
     */
    public function store2database()
    {
        global $serpdb;

        foreach ($this->Proxies as $p) {
            if (!$this->IsValidProxy($p)) {
                continue;
            }

            // Safe to interpolate: validation leaves only digits and dots.
            $ip = $p[0];
            $port = $p[1];

            $result = $serpdb->query("SELECT `id` FROM serp_proxies WHERE ip='" . $ip . "'");
            if ($result && mysql_num_rows($result) < 1) {
                $serpdb->query("INSERT INTO serp_proxies (`ip`, `port`) VALUES ('" . $ip . "', '" . $port . "')");
            }
        }

        // Clean up blank rows that earlier, unvalidated runs may have left behind.
        $serpdb->query("DELETE FROM serp_proxies WHERE `ip`=''");
    }

    /**
     * Tells whether a scraped pair is a plausible IPv4 address plus TCP port.
     *
     * @param mixed $pair expected to be array(ip, port)
     * @return bool
     */
    private function IsValidProxy($pair)
    {
        if (!is_array($pair) || !isset($pair[0], $pair[1])) {
            return false;
        }
        if (filter_var($pair[0], FILTER_VALIDATE_IP, FILTER_FLAG_IPV4) === false) {
            return false;
        }
        if (!preg_match('/^\d{1,5}$/', (string) $pair[1])) {
            return false;
        }

        $port = (int) $pair[1];

        return $port >= 1 && $port <= 65535;
    }

    /**
     * Removes a proxy from the database.
     *
     * @param string $ip address as stored in the `ip` column
     */
    public function DeleteProxy($ip)
    {
        global $serpdb;

        // Escaped rather than validated so that malformed legacy rows can still be pruned.
        // NOTE(review): relies on $serpdb using the default mysql_* connection -- confirm.
        $safe_ip = mysql_real_escape_string((string) $ip);

        $serpdb->query("DELETE FROM serp_proxies WHERE `ip`='" . $safe_ip . "'");
    }

    /**
     * Counts the proxies in the database; the result is also cached in $this->MyCount.
     *
     * @return int number of stored proxies, 0 when the query fails
     */
    public function GetCount()
    {
        global $serpdb;

        // Let the server count instead of transferring every row.
        $result = $serpdb->query("SELECT COUNT(*) AS `total` FROM `serp_proxies`");
        $row = $result ? mysql_fetch_object($result) : false;

        $this->MyCount = $row ? (int) $row->total : 0;

        return $this->MyCount;
    }
}
-
-
/**
 * Performs HTTP GET requests through a single proxy using cURL.
 *
 * After DoRequest() the response is available in $Content and the cURL
 * error state in $ProxyError / $ProxyErrorMsg.
 */
class Proxy
{
    /** IP address of the proxy; informational, assigned by the caller. */
    public $proxy_ip;

    /** Port of the proxy; informational, assigned by the caller. */
    public $proxy_port;

    /** Proxy address handed to cURL, in "host:port" form. */
    public $proxy_host;

    /** "username:password" for proxy authentication; left unset when none was given. */
    public $proxy_auth;

    /** cURL handle of the most recent request. */
    public $ch;

    /** Result of the most recent curl_exec() call (false when the request failed). */
    public $Content;

    /** User-Agent header sent with every request. */
    public $USERAGENT = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";

    /** cURL error number of the most recent request; 0 means success. */
    public $ProxyError = 0;

    /** cURL error message of the most recent request; '' means success. */
    public $ProxyErrorMsg = '';

    /** Maximum duration of a request in seconds (CURLOPT_TIMEOUT). */
    public $TimeOut = 3;

    /** When non-zero, response headers are included in $Content (CURLOPT_HEADER). */
    public $IncludeHeaders = 0;

    /**
     * Initializes the class and sets the proxy host.
     *
     * Declared as __construct() because a method named after the class is no
     * longer treated as a constructor from PHP 8.0 on, which would leave
     * $proxy_host empty.
     *
     * @param string $host     proxy address, "host:port"
     * @param string $username optional proxy user name
     * @param string $pwd      optional proxy password
     */
    public function __construct($host, $username = '', $pwd = '')
    {
        $this->proxy_host = $host;

        if (strlen((string) $username) > 0 || strlen((string) $pwd) > 0) {
            $this->proxy_auth = $username . ":" . $pwd;
        }
    }

    /**
     * Applies the proxy settings of this object to a cURL handle.
     *
     * Does nothing when no proxy host is set; credentials are only added
     * when they were supplied to the constructor.
     *
     * @param resource $cc cURL handle to configure
     */
    public function CURL_PROXY($cc)
    {
        // Casts keep strlen() from receiving null when a property was never set.
        if (strlen((string) $this->proxy_host) > 0) {
            curl_setopt($cc, CURLOPT_PROXY, $this->proxy_host);

            if (strlen((string) $this->proxy_auth) > 0) {
                curl_setopt($cc, CURLOPT_PROXYUSERPWD, $this->proxy_auth);
            }
        }
    }

    /**
     * Fetches $url through the proxy.
     *
     * Stores the body in $this->Content and, if cURL reports an error, its
     * number and message in $this->ProxyError / $this->ProxyErrorMsg.
     *
     * @param string $url address to request
     * @return bool true when the transfer completed without a cURL error
     */
    public function DoRequest($url)
    {
        // Start from a clean slate so an error from an earlier request on this
        // object is not mistaken for the outcome of this one.
        $this->ProxyError = 0;
        $this->ProxyErrorMsg = '';
        $this->Content = false;

        $this->ch = curl_init();
        if ($this->ch === false) {
            $this->ProxyError = CURLE_FAILED_INIT;
            $this->ProxyErrorMsg = 'curl_init() failed';
            return false;
        }

        curl_setopt($this->ch, CURLOPT_URL, $url);
        $this->CURL_PROXY($this->ch);
        curl_setopt($this->ch, CURLOPT_HEADER, $this->IncludeHeaders); // read the response headers too when enabled
        curl_setopt($this->ch, CURLOPT_USERAGENT, $this->USERAGENT);
        curl_setopt($this->ch, CURLOPT_RETURNTRANSFER, 1); // return the body instead of printing it
        curl_setopt($this->ch, CURLOPT_TIMEOUT, $this->TimeOut);

        $this->Content = curl_exec($this->ch);

        // If an error occurs, store the number and message.
        $errno = curl_errno($this->ch);
        if ($errno) {
            $this->ProxyError = $errno;
            $this->ProxyErrorMsg = curl_error($this->ch);
        }

        return $this->ProxyError === 0;
    }
}
-
There is not much to say about it; it is just a rough outline. I would prefer elite (level 1) proxies, but for now this will have to do.