juust ~ php oddities

Unordered list of one element
  • rss
  • begin
  • about
    • vcard
    • WTF is BroJesus
  • php scripts
    • flickr wp widget
    • google multi key serp tool, php script
    • gwt plugin
  • php classes
    • php pagerank class
    • fibonacci class
    • robots.txt parser php class
  • serp
    • serp dashboard wordpress plugin
  • services

proxies !

juust | 21/02/2009

I got a site banned at Google, so I got pissed and took a script from the blackbox @ digerati marketing that scrapes proxy addresses, then wired a database and curl into it. Now it scrapes proxies, picks one at random, prunes dead proxies and returns the data.

It is basic and only uses anonymous (level 2) proxies, but it works. You can check the source here:

  1.  
  2. /* (mysql table)
  3. CREATE TABLE IF NOT EXISTS `serp_proxies` (
  4.   `id` int(11) NOT NULL auto_increment,
  5.   `ip` text NOT NULL,
  6.   `port` text NOT NULL,
  7.   PRIMARY KEY  (`id`)
  8. ) ENGINE=MyISAM  DEFAULT CHARSET=latin1 AUTO_INCREMENT=1 ;
  9. */
  10.  
  // Example driver: make sure the proxy table is populated, fetch one page
  // through a randomly chosen proxy, prune the proxy if it turns out to be
  // dead, and print whatever came back.

  // Initialize the database class; replace with your own code.
  // MyProxies reads a global $serpdb object, presumably created here.
  include('init.php');

  // Main class.
  $p = new MyProxies;

  // Do I have proxies in the database? If not, scrape one page and store them.
  if ($p->GetCount() < 1) {
      $p->GetSomeAir(1);
      $p->store2database();
  }

  // Pick one.
  $p->RandomProxy();

  // RandomProxy() leaves ThisProxy unset when the table is still empty
  // (for example when the scrape returned nothing), so check before using it
  // instead of dying with "call to a member function on a non-object".
  if (!is_object($p->ThisProxy)) {
      trigger_error('No proxy available', E_USER_WARNING);
  } else {
      // Get the page.
      $p->ThisProxy->DoRequest('http://www.domain.com/robots.txt');

      // Error handling. cURL error numbers of interest:
      //   7   no connect
      //   28  timed out
      //   52  empty reply
      // A proxy that refuses connections or sends an empty reply is dead:
      // prune it. A timeout (28) is tolerated and the proxy is kept.
      if (in_array($p->ThisProxy->ProxyError, array(7, 52), true)) {
          $p->DeleteProxy($p->ThisProxy->proxy_ip);
      }
      // You could loop back until you get a 0-error proxy, but that ain't the point.

      // Give me the content.
      echo $p->ThisProxy->Content;
  }
  43.  
  44.  
  // Maintains a pool of proxies in the `serp_proxies` table: scrapes
  // candidates from samair.ru, stores them, picks one at random and deletes
  // entries by IP. All database access goes through the global $serpdb object;
  // its query() results are handed to the mysql_* functions, so it is assumed
  // to wrap the classic mysql extension (TODO confirm against init.php).
  class MyProxies {

      // Scraped proxies waiting to be stored; each entry is array(ip, port).
      var $Proxies = array();
      // Proxy instance created by the last successful RandomProxy() call.
      var $ThisProxy;
      // Row count found by the last GetCount() call.
      var $MyCount;


      // Picks a random proxy from the database and stores it in $this->ThisProxy.
      // Returns true when a proxy was selected, false when the table is empty
      // or a query failed (ThisProxy is left untouched in that case).
      function RandomProxy() {
          global $serpdb;

          // Let MySQL choose a random row offset in [0, COUNT(*)).
          $offset_result = $serpdb->query("SELECT FLOOR(RAND() * COUNT(*)) AS `offset` FROM `serp_proxies`");
          $offset_row = $offset_result ? mysql_fetch_object($offset_result) : false;
          if (!$offset_row) {
              return false;
          }

          // Cast so that only an integer is ever interpolated into LIMIT.
          $offset = (int) $offset_row->offset;
          $result = $serpdb->query("SELECT * FROM `serp_proxies` LIMIT $offset, 1");
          $row = $result ? mysql_fetch_assoc($result) : false;
          if (!$row) {
              return false;
          }

          // Make an instance of Proxy, with proxy_host set to "ip:port".
          $this->ThisProxy = new Proxy($row['ip'] . ':' . $row['port']);
          $this->ThisProxy->proxy_ip = $row['ip'];
          $this->ThisProxy->proxy_port = $row['port'];
          return true;
      }

      // Visit the famous russian site: downloads the first $pages listing pages
      // (proxy-01.htm, proxy-02.htm, ...) and appends every "ip:port" entry
      // found to $this->Proxies. Pages that cannot be fetched or parsed are
      // skipped silently.
      function GetSomeAir($pages) {
          for ($index = 0; $index < $pages; $index++) {
              $pageno = sprintf("%02d", $index + 1);
              $page_url = "http://www.samair.ru/proxy/proxy-" . $pageno . ".htm";

              // Best effort: a failed download just means no proxies from this page.
              $page_html = @file_get_contents($page_url);
              if ($page_html === false) {
                  continue;
              }

              // Get rid of the crap and extract the proxy table rows.
              if (!preg_match('#<tr><td>(.*)</td></tr>#', $page_html, $matches)) {
                  continue;
              }

              // NOTE(review): the row separator is kept exactly as published;
              // verify it against the current markup of the listing page.
              $rows = explode('</td><tr><td>', $matches[1]);
              foreach ($rows as $row) {
                  // The first cell of each row holds "ip:port".
                  $cells = explode('</td><td>', $row);
                  $parts = explode(':', $cells[0]);
                  if (count($parts) < 2) {
                      continue;
                  }
                  $ip = trim($parts[0]);
                  $port = trim($parts[1]);
                  if ($ip === '' || $port === '') {
                      continue;
                  }
                  $this->Proxies[] = array($ip, $port);
              }
          }
      }

      // Stores the retrieved proxies (held in $this->Proxies) in the database,
      // skipping IPs that are already present.
      function store2database() {
          global $serpdb;
          foreach ($this->Proxies as $p) {
              // Ignore malformed entries instead of inserting blanks.
              if (!isset($p[0], $p[1]) || $p[0] === '') {
                  continue;
              }

              // The values come from a scraped third-party page: escape them
              // before they go into SQL. Relies on the mysql connection that
              // $serpdb is assumed to have opened.
              $ip = mysql_real_escape_string($p[0]);
              $port = mysql_real_escape_string($p[1]);

              $result = $serpdb->query("SELECT * FROM serp_proxies WHERE ip='" . $ip . "'");
              if ($result && mysql_num_rows($result) < 1) {
                  $serpdb->query("INSERT INTO serp_proxies (`ip`, `port`) VALUES ('" . $ip . "', '" . $port . "')");
              }
          }
          // Clean up blank rows that earlier runs may have left behind.
          $serpdb->query("DELETE FROM serp_proxies WHERE `ip`=''");
      }


      // Removes every row with the given IP from the proxy table.
      function DeleteProxy($ip) {
          global $serpdb;
          $serpdb->query("DELETE FROM serp_proxies WHERE `ip`='" . mysql_real_escape_string($ip) . "'");
      }


      // Use this to check how many proxies there are in the database.
      // The count is also cached in $this->MyCount. Returns 0 when the query fails.
      function GetCount()
      {
          global $serpdb;

          // Let the server count instead of transferring every row.
          $result = $serpdb->query("SELECT COUNT(*) FROM `serp_proxies`");
          $row = $result ? mysql_fetch_row($result) : false;
          $this->MyCount = $row ? (int) $row[0] : 0;
          return $this->MyCount;
      }

  }
  115.  
  // Thin cURL wrapper that fetches a URL through one HTTP proxy and records
  // the response body together with any cURL error.
  class Proxy {

      // IP and port of the proxy; filled in by the caller for bookkeeping.
      var $proxy_ip;
      var $proxy_port;

      // "ip:port" string handed to CURLOPT_PROXY.
      var $proxy_host = '';
      // "username:password" for CURLOPT_PROXYUSERPWD, or '' for no authentication.
      var $proxy_auth = '';
      // cURL handle of the most recent request.
      var $ch;
      // Result of curl_exec() for the most recent request (false on failure).
      var $Content;
      var $USERAGENT = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";
      // cURL error number and message of the most recent request (0 / '' = ok).
      var $ProxyError = 0;
      var $ProxyErrorMsg = '';
      // Total request timeout in seconds (CURLOPT_TIMEOUT).
      var $TimeOut = 3;
      // Non-zero to include the response headers in $Content (CURLOPT_HEADER).
      var $IncludeHeaders = 0;

      // Initialize the class and set the proxy host.
      // $host is "ip:port"; $username / $pwd are optional proxy credentials.
      // Declared as __construct because a method named after the class is no
      // longer treated as a constructor by PHP 8, which would leave
      // proxy_host empty without any error.
      function __construct($host, $username = '', $pwd = '') {
          $this->proxy_host = $host;
          if (strlen($username) > 0 || strlen($pwd) > 0) {
              $this->proxy_auth = $username . ":" . $pwd;
          }
      }

      // Applies the proxy settings (and credentials, if any) to cURL handle $cc.
      function CURL_PROXY($cc) {
          if (strlen($this->proxy_host) > 0) {
              curl_setopt($cc, CURLOPT_PROXY, $this->proxy_host);
              if (strlen($this->proxy_auth) > 0) {
                  curl_setopt($cc, CURLOPT_PROXYUSERPWD, $this->proxy_auth);
              }
          }
      }

      // Fetches $url through the proxy. The body ends up in $this->Content;
      // on failure the cURL error number and message are stored in
      // $this->ProxyError and $this->ProxyErrorMsg.
      function DoRequest($url) {
          // Release the handle of a previous request so that reusing the
          // object does not leak one handle per call. The new handle stays
          // open afterwards and remains reachable through $this->ch.
          if (is_resource($this->ch)) {
              curl_close($this->ch);
          }

          // Start from a clean slate: without this a successful request would
          // still report the error of an earlier failed one.
          $this->ProxyError = 0;
          $this->ProxyErrorMsg = '';

          $this->ch = curl_init();
          curl_setopt($this->ch, CURLOPT_URL, $url);
          $this->CURL_PROXY($this->ch);
          curl_setopt($this->ch, CURLOPT_HEADER, $this->IncludeHeaders); // read header

          curl_setopt($this->ch, CURLOPT_USERAGENT, $this->USERAGENT);
          curl_setopt($this->ch, CURLOPT_RETURNTRANSFER, 1);
          curl_setopt($this->ch, CURLOPT_TIMEOUT, $this->TimeOut);
          $this->Content = curl_exec($this->ch);

          // If an error occurs, store the number and message.
          $errno = curl_errno($this->ch);
          if ($errno) {
              $this->ProxyError = $errno;
              $this->ProxyErrorMsg = curl_error($this->ch);
          }
      }

  }
  167.  

There is not much to say about it; it is just a rough outline. I would prefer elite (level 1) proxies, but for now it will have to do.

Categories
php, seo tips and tricks
Tags
php, scrape, seo tips and tricks
Comments rss
Comments rss
Trackback
Trackback

« using ajax readystate 3 polling trackbacks »

Leave a Reply

Click here to cancel reply.

Recent Posts

  • Pagerank sculpting session
  • wish you were here
  • interesting : seo panel
  • availability test
  • Mayday

click me!
rss
Comments rss
Blog Directory
Web Developement Blogs - BlogCatalog Blog Directory
Listed in LS Blogs the Blog Directory and Blog Search Engine
Blog Flux Directory
joopita.com free web directory and search engine
design by jide
sitemap
17240 confirmed spam kills