建表语句:CREATE TABLE dy2018_url (id int(9) NOT NULL AUTO_INCREMENT, url varchar(2000) NOT NULL, status tinyint(2) NOT NULL, PRIMARY KEY(id)); 代码: <?php declare(ticks = 1); pcntl_signal(SIGQUIT, 'signal_handler'); pcntl_signal(SIGTERM, 'signal_handler'); ……
建表语句(表名需与代码中插入的 dy2018_url 保持一致):CREATE TABLE dy2018_url (id int(9) NOT NULL AUTO_INCREMENT, url varchar(2000) NOT NULL, status tinyint(2) NOT NULL, PRIMARY KEY(id));
代码:
<?php
// Multi-process crawler for dy2018.com (CLI only; needs pcntl, posix, mbstring, mysqli).
//
// The parent forks one worker per entry URL. Each worker walks the paginated
// listing under its entry URL, opens every detail page it finds, extracts the
// ftp:// download links and stores them in the `dy2018_url` table.

// Ticks let PHP dispatch pending signals between statements.
declare(ticks = 1);

pcntl_signal(SIGQUIT, 'signal_handler');
pcntl_signal(SIGTERM, 'signal_handler');

// PIDs of the workers forked by this process (always empty inside a worker).
$crawlers_pid = array();
// Number of workers the parent has already reaped.
$finish_count = 0;
// Per-worker mysqli connection; stays null in the parent.
$con = null;

/**
 * Signal handler for SIGQUIT / SIGTERM.
 *
 * Forwards SIGTERM to every worker recorded in $crawlers_pid, closes this
 * process's database connection if it has one, and terminates the process.
 * Other signals are ignored.
 *
 * @param int $signal Signal number delivered by the OS.
 */
function signal_handler($signal)
{
    global $crawlers_pid;
    global $con;

    if ($signal != SIGQUIT && $signal != SIGTERM) {
        return;
    }

    foreach ($crawlers_pid as $pid) {
        posix_kill($pid, SIGTERM);
    }
    echo "---------- crawl task exit ----------";

    if ($con instanceof mysqli) {
        mysqli_close($con);
    }
    exit();
}

/**
 * Fetch a page with an HTTP GET request.
 *
 * @param string $url Absolute URL to fetch.
 * @return string|false Response body, or false when the request fails.
 */
function get_page_content($url)
{
    return file_get_contents($url);
}

/**
 * Fetch a page with an HTTP POST request (form-urlencoded body).
 *
 * @param string $url Absolute URL to post to.
 * @param array  $arr Form fields; encoded with http_build_query().
 * @return string|false Response body, or false when the request fails.
 */
function get_page_content_by_post($url, $arr)
{
    // The original stored the encoded body in $arr but then read the
    // undefined $data, so an empty body was always sent.
    $data = http_build_query($arr);

    $opts = array(
        'http' => array(
            'method'  => 'POST',
            // Header lines must be separated by CRLF.
            'header'  => "Content-Type: application/x-www-form-urlencoded\r\n"
                       . 'Content-Length: ' . strlen($data) . "\r\n",
            'content' => $data,
        ),
    );

    $context = stream_context_create($opts);
    return file_get_contents($url, false, $context);
}

/**
 * Main flow: fork one worker per entry URL and wait for all of them.
 *
 * Never returns: the parent exits once every worker has been reaped, and each
 * worker exits as soon as its own crawl is done.
 */
function run_dy2018()
{
    global $crawlers_pid;
    global $finish_count;
    global $con;

    $crawl_urls = array(
        "http://www.dy2018.com/html/tv/hytv/",
        "http://www.dy2018.com/html/tv/hepai/",
        "http://www.dy2018.com/html/tv/gangtai/",
        "http://www.dy2018.com/html/tv/oumeitv/",
        "http://www.dy2018.com/html/tv/rihantv/",
        "http://www.dy2018.com/html/tv/tvzz/",
    );
    // Numbered category sections /0/ .. /20/.
    for ($section = 0; $section <= 20; $section++) {
        $crawl_urls[] = "http://www.dy2018.com/" . $section . "/";
    }

    foreach ($crawl_urls as $i => $url) {
        $pid = pcntl_fork();

        if ($pid == -1) {
            // Do not leave already-started workers running unsupervised.
            foreach ($crawlers_pid as $started_pid) {
                posix_kill($started_pid, SIGTERM);
            }
            echo "system error. \ncheck it now!";
            exit(1);
        }

        if ($pid === 0) {
            // Worker process. Forget the sibling PIDs inherited from the
            // parent so a signal delivered to this worker does not kill them.
            $crawlers_pid = array();

            // Report connection failures through the return value on every
            // PHP version (PHP >= 8.1 throws by default).
            mysqli_report(MYSQLI_REPORT_OFF);
            $con = mysqli_connect("localhost", "root", "123456", "mysql");
            if (!$con) {
                die('Could not connect: ' . mysqli_connect_error());
            }

            crawl_process($url);

            mysqli_close($con);
            // The worker must stop here; otherwise it would fall back into
            // the fork loop and spawn workers of its own.
            exit(0);
        }

        $crawlers_pid[$i] = $pid;
    }

    // Reap workers without blocking so pending signals are still dispatched.
    // A worker's counter lives in its own address space, so completion has to
    // be observed here through waitpid rather than a shared variable.
    while ($finish_count < count($crawlers_pid)) {
        $status = 0;
        $reaped = pcntl_waitpid(-1, $status, WNOHANG);
        if ($reaped > 0) {
            $finish_count++;
            continue;
        }
        if ($reaped == -1) {
            // No children left to wait for.
            break;
        }
        sleep(1);
    }

    echo "---------- crawl task finish ----------";
    exit();
}

/**
 * Crawl every listing page below an entry URL and store the ftp:// download
 * links found on the linked detail pages.
 *
 * Uses the global mysqli connection $con opened by the worker process.
 * Pagination stops at the first listing page that cannot be fetched.
 *
 * @param string $url Entry URL of a category, ending with '/'.
 */
function crawl_process($url)
{
    global $con;

    echo "start handle url:" . $url;

    if (!($con instanceof mysqli)) {
        echo "no database connection, skip url:" . $url . "\n";
        return;
    }

    $info_url_pattern = '/\/i\/\d+\.html/';
    // "rmvb" is tried before "rm" and the extension must not be followed by
    // another alphanumeric, so "x.rmvb" is no longer truncated to "x.rm".
    $ftp_url_pattern = '/ftp:\/\/.*?\.(?:rmvb|swf|avi|flv|mpg|rm|mov|wav|asf|3gp|mkv)(?![a-z0-9])/i';

    // Crawled text is untrusted: bind it as a parameter, never interpolate.
    $insert = mysqli_prepare($con, 'insert into dy2018_url (url, status) values (?, 0)');
    if (!$insert) {
        echo "prepare failed: " . mysqli_error($con) . "\n";
        return;
    }

    for ($page_idx = 1; ; $page_idx++) {
        $page_url = get_page_index_url($url, $page_idx);
        // Plain echo: the URL must not be interpreted as a printf format.
        echo "start crawl url:" . $page_url . "\n";

        $page_content = get_page_content($page_url);
        if (!is_valid_page($page_content)) {
            break;
        }

        $matches_urls = array();
        preg_match_all($info_url_pattern, $page_content, $matches_urls);

        // A detail link listed several times on one page is fetched once.
        foreach (array_unique($matches_urls[0]) as $detail_path) {
            $detail_url = 'http://www.dy2018.com' . $detail_path;
            $detail_page_content = get_page_content($detail_url);

            if (is_valid_page($detail_page_content)) {
                // The site serves GBK; store the links as UTF-8.
                $detail_page_content = mb_convert_encoding($detail_page_content, "UTF-8", "GBK");

                $ftp_urls = array();
                preg_match_all($ftp_url_pattern, $detail_page_content, $ftp_urls);

                foreach (array_unique($ftp_urls[0]) as $ftp_link) {
                    mysqli_stmt_bind_param($insert, 's', $ftp_link);
                    if (!mysqli_stmt_execute($insert)) {
                        echo "insert failed: " . mysqli_stmt_error($insert) . "\n";
                    }
                }
            }

            // Throttle requests to the site.
            sleep(1);
        }
    }

    mysqli_stmt_close($insert);
}

/**
 * Build the URL of the listing page with the given page number.
 *
 * @param string $url Entry URL ending with '/'.
 * @param int    $idx Page number: 1 maps to index.html, n > 1 to index_n.html.
 * @return string The page URL; $url unchanged when $idx is below 1.
 */
function get_page_index_url($url, $idx)
{
    if ($idx == 1) {
        return $url . 'index.html';
    }
    if ($idx > 1) {
        return $url . 'index_' . $idx . '.html';
    }
    return $url;
}

/**
 * Decide from the fetched content whether the page exists.
 *
 * @param string|false $content Result of get_page_content().
 * @return bool True for a non-empty string body, false otherwise.
 */
function is_valid_page($content)
{
    return is_string($content) && $content !== '';
}

run_dy2018();
?>