当前位置 : 主页 > 网络编程 > PHP >

PHP特定爬虫程序备份

来源:互联网 收集:自由互联 发布时间:2021-06-30
摘要:一个 PHP 爬虫脚本的备份。脚本逐页抓取 www.rz0375.com 电话簿列表,打开每个条目的详情页,解析名称、电话、手机和地址,并写入 MySQL 表 rzrmh_tel。完整代码如下。
 
<?PHP
// Bootstrap for the phonebook crawler: error level, database connection,
// timezone and the timestamps shared by the rest of the script.

// Report everything except notices.
error_reporting(E_ALL^E_NOTICE);

// NOTE(review): the mysql_* extension was deprecated in PHP 5.5 and removed in
// PHP 7.0. Moving to mysqli/PDO has to be done together with the mysql_query()
// INSERT in the crawl loop further down, which relies on this connection.
// NOTE(review): credentials are hard-coded here; consider loading them from
// configuration that is kept out of version control.
// '@' only hides the PHP warning; the failure itself is handled right below.
$conn = @ mysql_connect("localhost", "ruzhouren", "ruzhourenmysql");
if (!$conn) {
        die("mysql connect failed: " . mysql_error());
}
if (!mysql_select_db("ruzhouren", $conn)) {
        die("mysql select_db failed: " . mysql_error($conn));
}
// mysql_set_charset() switches the connection to utf8 and also tells the
// client library about it, which a plain "SET NAMES" query does not do.
if (!mysql_set_charset("utf8", $conn)) {
        die("mysql set charset failed: " . mysql_error($conn));
}

// "Etc/GMT-8" is UTC+8: Etc/GMT* zone names use the inverted POSIX sign.
date_default_timezone_set("ETC/GMT-8");
$day_date=date('Y-m-d');
$time_date=date('Y-m-d H:i:s');
$time_dateline=time();

// The lookup in rzr_wsq_rzbst_num is disabled. Its query was already commented
// out, so the fetch below it operated on an undefined variable and its result
// was never read; it is disabled as well.
//$start_num_query=mysql_query("SELECT * FROM `rzr_wsq_rzbst_num` where `id`= 1 ");
//$start_num=mysql_fetch_array($start_num_query);
  
/**
 * Remove every tab, carriage return, line feed and space from a string.
 *
 * Despite its name this helper does not touch HTML tags: it only deletes
 * whitespace characters, including the ones inside the text, and trims the
 * result.
 *
 * @param string $str Raw text fragment.
 * @return string The fragment without the whitespace characters listed above.
 */
function DeleteHtml($str) {
        // str_replace() applies the needles one after another, in this order.
        $needles = array("\t", "\r\n", "\r", "\n", " ", "  ");
        $compact = str_replace($needles, "", trim($str));
        return trim($compact);
}
  
/**
 * Download a page and convert it from GB2312 to UTF-8.
 *
 * @param string $url Absolute URL to fetch.
 * @return string|false UTF-8 page body; an empty string when the charset
 *                      conversion fails; false when the download fails.
 */
function rzr_fetch_utf8($url) {
        // One request per page. '@' only hides the PHP warning; the failure is
        // reported to the caller through the return value.
        $raw = @file_get_contents($url);
        if ($raw === false) {
                return false;
        }
        // //IGNORE belongs on the target charset: it makes iconv drop bytes it
        // cannot convert instead of giving up on the whole page.
        $utf8 = iconv("GB2312", "UTF-8//IGNORE", trim($raw));
        return $utf8 === false ? "" : $utf8;
}

/**
 * Extract the labelled fields from the body of one detail page.
 *
 * Each "<br />"-separated line is stripped of whitespace; a line is used when
 * its first two characters are one of the known labels. The value is whatever
 * follows the first ":" once tags are removed. Tag lines ("标签") are ignored
 * because the INSERT below has no column for them.
 *
 * @param string $detail_html Inner HTML of the detail block.
 * @return array Map with the keys name, phone, mobile and address; a label
 *               that does not occur yields an empty string.
 */
function rzr_parse_entry_fields($detail_html) {
        $labels = array(
                "名称" => "name",
                "电话" => "phone",
                "手机" => "mobile",
                "地址" => "address",
        );
        // Start from empty values so nothing leaks over from a previous entry.
        $fields = array_fill_keys(array_values($labels), "");
        foreach (explode("<br />", $detail_html) as $line) {
                $line = DeleteHtml($line);
                $label = mb_substr($line, 0, 2, 'utf-8');
                if (!isset($labels[$label])) {
                        continue;
                }
                // Limit 2 keeps values that themselves contain a colon intact.
                $parts = explode(":", strip_tags($line), 2);
                $fields[$labels[$label]] = isset($parts[1]) ? $parts[1] : "";
        }
        return $fields;
}

// Walk the listing pages one by one, open every entry's detail page and store
// the parsed contact data in rzrmh_tel.
for ($a = 170; $a < 1000000; $a++) {
        $url = "http://www.rz0375.com/phonebook/all/pn{$a}/";
        $listing = rzr_fetch_utf8($url);
        if ($listing === false) {
                // An unreachable listing page ends the whole run.
                die("timeout");
        }

        // Everything from the first entry link to the last closing </a></span>.
        $entries = array();
        if (preg_match('/target="_blank" >(.*)<\/a><\/span>/ms', $listing, $listing_match)) {
                $entries = explode("<li>", $listing_match[0]);
        }

        foreach ($entries as $span) {
                $pieces = explode("http://", $span);
                if (!isset($pieces[1])) {
                        // Chunk without any link (e.g. the text before the first <li>).
                        continue;
                }
                // The first link is cut at a different delimiter depending on
                // whether the chunk contains one link or several.
                if (count($pieces) < 3) {
                        $target = explode('">', $pieces[1]);
                } else {
                        $target = explode('" target="_blank"', $pieces[1]);
                }
                $url_con = "http://" . $target[0];
                // Path of the entry relative to the phonebook root.
                $http = str_replace('www.rz0375.com/phonebook/', '', $target[0]);

                $detail = rzr_fetch_utf8($url_con);
                if ($detail === false) {
                        continue;
                }
                if (!preg_match('/<div class="con f14 fa">(.*)<\/div>/mUs', $detail, $detail_match)) {
                        // No detail block on the page: nothing to store.
                        continue;
                }
                // Drop the "ewm" and "yp" spans before splitting into lines.
                $body = preg_replace('/<span class="ewm">(.*?)<\/span>/', '', $detail_match[1]);
                $body = preg_replace('/<span class="yp">(.*?)<\/span>/', '', $body);

                $fields = rzr_parse_entry_fields($body);
                if ($fields["name"] === "") {
                        // Without a name the row would be meaningless.
                        continue;
                }

                // Scraped text is untrusted: escape every value before it goes
                // into the statement. Column mapping: tel <- 电话, tel2 <- 手机.
                $sql = sprintf(
                        "INSERT INTO `ruzhouren`.`rzrmh_tel` (`id`, `name`, `status`, `order`, `url`, `vip`, `tel`, `tel2`, `addr`, `add_uid`, `own_uid`, `add_dateline`, `type`, `http`) VALUES (NULL, '%s', '1', '1', '0', '1', '%s', '%s', '%s', '1', '1', '%d', '0', '%s');",
                        mysql_real_escape_string($fields["name"]),
                        mysql_real_escape_string($fields["phone"]),
                        mysql_real_escape_string($fields["mobile"]),
                        mysql_real_escape_string($fields["address"]),
                        (int) $time_dateline,
                        mysql_real_escape_string($http)
                );
                if (mysql_query($sql)) {
                        echo "insert-----{$fields['name']}  Succeed!\n";
                } else {
                        echo "insert-----{$fields['name']}  Failed: " . mysql_error() . "\n";
                }
        }

        // Throttle: pause 15 seconds between listing pages.
        sleep(15);
}
?>

网友评论