proxies !

I got a site banned at Google, so I got pissed and took a script from the blackbox @ digerati marketing that scrapes proxy addresses, then wired a database and curl into it. Now it scrapes proxies, picks one at random, prunes dead proxies and returns the data.

It is basic and only uses anonymous (level 2) proxies, but it works.


/* (mysql table)
CREATE TABLE IF NOT EXISTS `serp_proxies` (
  `id` int(11) NOT NULL auto_increment,
  `ip` text NOT NULL,
  `port` text NOT NULL,
  PRIMARY KEY  (`id`)
) ENGINE=MyISAM  DEFAULT CHARSET=latin1 AUTO_INCREMENT=1 ;
*/

//initialize database class, replace with own code
//(presumably init.php sets up the global $serpdb object that MyProxies uses)
include('init.php');

//main class
$p=new MyProxies;

//do I have proxies in the database ?
//if not, scrape one listing page and store whatever it yields
if($p->GetCount() < 1) {
	$p->GetSomeAir(1);
	$p->store2database();
}

//pick one
$p->RandomProxy();

//the scrape can come back empty, in which case no proxy was selected :
//stop here instead of dying on a method call on a non-object
if(!is_object($p->ThisProxy)) {
	die('no proxy available');
}

//get the page
$p->ThisProxy->DoRequest('http://www.domain.com/robots.txt');

//error handling, ProxyError holds the curl error number :
//7 		no connect
//28 		timed out
//52 		empty reply
//if it is dead (7) or returns nothing (52) : prune it
//a timeout (28) is left alone, the proxy stays in the database
if(in_array($p->ThisProxy->ProxyError, array(7, 52), true)) {
	$p->DeleteProxy($p->ThisProxy->proxy_ip);
}
//you could loop back until you get a 0-error proxy, but that ain't the point

//give me the content
echo $p->ThisProxy->Content;


//Scrapes proxy addresses, keeps them in the `serp_proxies` table and hands out
//a random one wrapped in a Proxy object.
//Relies on a global $serpdb object whose query() returns an ext/mysql result
//(the results are read with mysql_fetch_* below).
Class MyProxies {

//scraped proxies waiting to be stored, each entry is array(ip, port)
	var $Proxies = array();
//Proxy instance chosen by RandomProxy(), null when nothing could be picked
	var $ThisProxy;
//row count found by the last GetCount() call
	var $MyCount;


//picks a random proxy from the database and puts it in $this->ThisProxy
//when the table is empty ThisProxy is reset to null, so a proxy picked
//(and possibly deleted) earlier is never handed out again
	function RandomProxy() {

		global $serpdb;
		$this->ThisProxy = null;

		$offset_result = $serpdb->query("SELECT FLOOR(RAND() * COUNT(*)) AS `offset` FROM `serp_proxies`");
		$offset_row = $offset_result ? mysql_fetch_object($offset_result) : false;
//cast to int : the value is interpolated into the LIMIT clause below
		$offset = $offset_row ? (int) $offset_row->offset : 0;

		$result = $serpdb->query("SELECT * FROM `serp_proxies` LIMIT $offset, 1");
		$row = $result ? mysql_fetch_assoc($result) : false;
		if($row) {
//make instance of Proxy, with proxy_host ip and port
			$this->ThisProxy = new Proxy($row['ip'].':'.$row['port']);
			$this->ThisProxy->proxy_ip = $row['ip'];
			$this->ThisProxy->proxy_port = $row['port'];
		}
	}

//visit the famous russian site
//$pages : number of listing pages to fetch (proxy-01.htm, proxy-02.htm, ...)
//every ip:port found is appended to $this->Proxies
	function GetSomeAir($pages) {
		for($index=0; $index < $pages; $index++)
		{
			$pageno = sprintf("%02d",$index+1);
			$page_url = "http://www.samair.ru/proxy/proxy-" . $pageno . ".htm";
//best effort : a page that cannot be fetched is skipped, not fatal
			$page_html = @file_get_contents($page_url);
			if($page_html === false) continue;

			foreach($this->ExtractProxies($page_html) as $pair) {
				$this->Proxies[] = $pair;
			}
		}
	}

//pulls every "a.b.c.d:port" occurrence out of a chunk of html/text
//returns a list of array(ip, port) pairs (both strings), empty when none found
//NOTE(review): this assumes the listing prints ip:port as plain text, confirm
//against the current page markup
	function ExtractProxies($html) {
		$found = array();
		if(preg_match_all('/\b((?:\d{1,3}\.){3}\d{1,3}):(\d{1,5})\b/', (string) $html, $hits, PREG_SET_ORDER)) {
			foreach($hits as $hit) {
				$found[] = array($hit[1], $hit[2]);
			}
		}
		return $found;
	}

//store the retrieved proxies (stored in this->Proxies) in the database
//an ip that is already in the table is not inserted a second time
	function store2database() {
		global $serpdb;
		foreach($this->Proxies as $p) {
//skip incomplete entries instead of inserting rows with an empty ip/port
			if(!isset($p[0], $p[1]) || $p[0] === '') continue;

//escape before building the statement, the values come from a remote page
			$ip = mysql_real_escape_string($p[0]);
			$port = mysql_real_escape_string($p[1]);

			$result = $serpdb->query("SELECT * FROM serp_proxies WHERE ip='".$ip."'");
			if(mysql_num_rows($result)<1) $serpdb->query("INSERT INTO serp_proxies (`ip`, `port`) VALUES ('".$ip."', '".$port."')");
		}
//clean up empty rows that may already be in the table
		$serpdb->query("DELETE FROM serp_proxies WHERE `ip`=''");
	}


//removes every row with this ip from the database
	function DeleteProxy($ip) {
		global $serpdb;
		$serpdb->query("DELETE FROM serp_proxies WHERE `ip`='".mysql_real_escape_string($ip)."'");
	}


	function GetCount()
	{
//use this to check how many proxies there are in the database
//lets mysql do the counting instead of fetching every row
		global $serpdb;
		$result = $serpdb->query("SELECT COUNT(*) AS `cnt` FROM `serp_proxies`");
		$row = $result ? mysql_fetch_assoc($result) : false;
		$this->MyCount = $row ? (int) $row['cnt'] : 0;
		return $this->MyCount;
	}


}

//One proxy server plus a curl request made through it.
//After DoRequest() the result is in $Content and any curl error in
//$ProxyError / $ProxyErrorMsg.
Class Proxy {

//filled in by the caller (MyProxies::RandomProxy), not used by this class itself
	var $proxy_ip;
	var $proxy_port;

//"ip:port" handed to CURLOPT_PROXY
	var $proxy_host;
//"user:password" handed to CURLOPT_PROXYUSERPWD, unset when no credentials given
	var $proxy_auth;
//curl handle of the last request
	var $ch;
//return value of curl_exec for the last request
	var $Content;
	var $USERAGENT = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";
//curl error number of the last request, 0 means no error
	var $ProxyError = 0;
	var $ProxyErrorMsg = '';
//passed to CURLOPT_TIMEOUT
	var $TimeOut=3;
//passed to CURLOPT_HEADER
	var $IncludeHeaders = 0;

//initialize class, set host
//$host : "ip:port" of the proxy
//$username, $pwd : optional proxy credentials
	function __construct($host, $username='', $pwd='') {
		$this->proxy_host = $host;
		if (strlen($username) > 0 || strlen($pwd) > 0) {
			$this->proxy_auth = $username.":".$pwd;
		}
	}

//old php4 style constructor name, kept so code calling ->Proxy() or
//parent::Proxy() keeps working ; php 8 no longer treats it as a constructor,
//which is why __construct() above does the real work
	function Proxy($host, $username='', $pwd='') {
		$this->__construct($host, $username, $pwd);
	}

//applies the proxy settings (host and, when set, credentials) to curl handle $cc
	function CURL_PROXY($cc) {
		if (strlen((string) $this->proxy_host) > 0) {
			curl_setopt($cc, CURLOPT_PROXY, $this->proxy_host);
//cast : proxy_auth stays null when no credentials were given
			if (strlen((string) $this->proxy_auth) > 0)
				curl_setopt($cc, CURLOPT_PROXYUSERPWD, $this->proxy_auth);
		}
	}

//fetches $url through the proxy
//result goes to $this->Content, curl errors to ProxyError / ProxyErrorMsg
	function DoRequest($url) {
//forget the outcome of a previous request on this object, otherwise an
//old error would still be reported after a successful request
		$this->ProxyError = 0;
		$this->ProxyErrorMsg = '';

		$this->ch = curl_init();
		curl_setopt($this->ch, CURLOPT_URL,$url);
		$this->CURL_PROXY($this->ch);
		curl_setopt($this->ch, CURLOPT_HEADER, $this->IncludeHeaders); // include response headers in Content when set

		curl_setopt($this->ch, CURLOPT_USERAGENT, $this->USERAGENT);
		curl_setopt($this->ch, CURLOPT_RETURNTRANSFER, 1);
		curl_setopt($this->ch, CURLOPT_TIMEOUT, $this->TimeOut);
		$this->Content = curl_exec($this->ch);

//if an error occurs, store the number and message
		$errno = curl_errno($this->ch);
		if ($errno) {
			$this->ProxyError = $errno;
			$this->ProxyErrorMsg = curl_error($this->ch);
		}
	}

}

There is not much to say about it, just a rough outline. I would prefer elite level 1 proxies but for now it will have to do.

Posted in php, seo tips and tricks.

Leave a Reply

Your email address will not be published. Required fields are marked *