当前位置 : 主页 > 网络编程 > PHP >

抓取博客园文章列表

来源:互联网 收集:自由互联 发布时间:2021-07-03
<?php /** * * @authors HG (hg0728@qq.com) * @date 2015-05-22 17:00:48 * @version 1.0 */header("Content-type:text/html;charset=utf-8");function getCurl($url) { $ch = curl_init(); curl_setopt($ch, CURLOPT_URL, $url); curl_setopt($ch, CURLOPT_RE…
 
<?php
/**
 * Scrapes the cnblogs.com article listing pages and appends each article's
 * link and title to blogs.txt.
 *
 * @authors HG (hg0728@qq.com)
 * @date    2015-05-22 17:00:48
 * @version 1.0
 */
// Declare the script's own output as UTF-8 HTML before anything is echoed.
header("Content-type:text/html;charset=utf-8");
/**
 * Fetch a URL with cURL and return the response body.
 *
 * The transfer is bounded by a connect timeout and a total timeout so that a
 * single unresponsive server cannot block the caller indefinitely.
 *
 * @param string $url Address to request.
 * @return string|false Response body on success; false when the cURL handle
 *                      could not be created or the transfer failed.
 */
function getCurl($url) {
    $ch = curl_init();
    if ($ch === false) {
        error_log('getCurl: unable to initialise a cURL handle for ' . $url);
        return false;
    }

    curl_setopt_array($ch, array(
        CURLOPT_URL            => $url,
        // Return the body from curl_exec() instead of printing it.
        CURLOPT_RETURNTRANSFER => 1,
        // NOTE(review): certificate and host verification are switched off,
        // which leaves HTTPS requests open to man-in-the-middle attacks.
        // Kept as in the original so existing callers keep working on hosts
        // without a CA bundle; enable both checks if HTTPS URLs are fetched.
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_SSL_VERIFYHOST => false,
        // Give up on connections that never open or transfers that stall.
        CURLOPT_CONNECTTIMEOUT => 10,
        CURLOPT_TIMEOUT        => 30,
    ));

    $result = curl_exec($ch);
    if ($result === false) {
        // Record why the request failed; the caller still receives false.
        error_log('getCurl: request to ' . $url . ' failed: ' . curl_error($ch));
    }
    curl_close($ch);

    return $result;
}
  
/**
 * Extract article links and titles from a fetched listing page.
 *
 * Every `<h3><a class="titlelnk" ...>` anchor found in the markup is echoed
 * (its href only) and appended to blogs.txt in the current working directory
 * as one "href title" line.
 *
 * @param string|false $str HTML of a listing page; non-string or empty input
 *                          (for example the false returned by a failed fetch)
 *                          is ignored.
 * @return void
 */
function preg_list($str){
    // Nothing to parse when the fetch failed or returned an empty body.
    if (!is_string($str) || $str === '') {
        return;
    }

    $regex = '/<h3><a class="titlelnk" href="(.*?)" target="_blank">(.*?)<\\/a><\\/h3>/';
    $found = preg_match_all($regex, $str, $matches);
    // preg_match_all() yields false on a regex error and 0 when nothing matched.
    if (!$found) {
        return;
    }

    $lines = '';
    for ($i = 0; $i < $found; $i++) {
        echo $matches[1][$i];
        // A real newline terminates each record; the previous "\\n" wrote a
        // literal backslash followed by "n", putting all records on one line.
        $lines .= $matches[1][$i] . ' ' . $matches[2][$i] . "\n";
    }

    // One append per page instead of reopening the file for every match.
    file_put_contents('blogs.txt', $lines, FILE_APPEND);
}
// Crawl the listing: index 0 is the site home page, indexes 1..200 are the
// numbered "sitehome" pages. Each fetched page is handed to preg_list().
$i = 0;
while ($i < 201) {
    $url = ($i == 0)
        ? 'http://www.cnblogs.com/'
        : 'http://www.cnblogs.com/sitehome/p/'.$i;
    $str = getCurl($url);
    preg_list($str);
    $i++;
}

网友评论