当前位置 : 主页 > 网络编程 > PHP >

PHP爬取360手机助手网页版数据

来源:互联网 收集:自由互联 发布时间:2021-06-28
gistfile1.txt '经营策略', 54='棋牌天地', 102238='儿童游戏', 101587='角色扮演', 19='休闲益智', 20='动作冒险', 100451='网络游戏', 51='体育竞速', 52='飞行射击'];public function __construct(){$link = mysql_co
gistfile1.txt
 '经营策略', 54=>'棋牌天地', 102238=>'儿童游戏', 101587=>'角色扮演', 19=>'休闲益智', 20=>'动作冒险', 100451=>'网络游戏', 51=>'体育竞速', 52=>'飞行射击'];


	/**
	 * Opens the MySQL connection and selects the "market" database.
	 *
	 * The link is not stored on the object; the mysql_* calls elsewhere in
	 * this class rely on the extension's implicit "last opened link".
	 * On any failure a short message is printed and the script terminates.
	 */
	public function __construct()
	{
		// NOTE(review): credentials are hard-coded in source; consider moving them to configuration.
		$link = mysql_connect('localhost', 'market', 'AW_d4au_jA3hH21212');
		if (!$link) {
			echo 'connect failed......';
			die;
		}

		// mysql_select_db() returns false on failure; without this check the
		// problem would only surface later as failing queries.
		if (!mysql_select_db('market', $link)) {
			echo 'select db failed......';
			die;
		}
	}

	/**
	 * Builds the list of listing-page URLs to crawl.
	 *
	 * For every key of $this->cateId combined with every entry of $this->url,
	 * pages 1 through 50 are generated in the form
	 * BASEURL/<category key>/<url entry>/?page=<n>.
	 *
	 * @return array flat list of URL strings
	 */
	public function getPages()
	{
		$list = [];
		foreach ($this->cateId as $categoryKey => $unusedLabel) {
			foreach ($this->url as $segment) {
				// Everything up to the page number is constant for this pair; the category id is part of it.
				$prefix = self::BASEURL.'/'.$categoryKey.'/'.$segment.'/?page=';
				foreach (range(1, 50) as $pageNo) {
					$list[] = $prefix.$pageNo;
				}
			}
		}
		return $list;
	}

	/**
	 * Collects product sids from every listing page.
	 *
	 * Each listing page is downloaded and scanned with a regular expression;
	 * every captured sid is stored as a key with the page's category id as
	 * its value. Pages that cannot be downloaded are skipped.
	 *
	 * @return array map of sid => category id (cid)
	 */
	public function getSids()
	{
		set_time_limit(0);
		$sids = array();
		foreach ($this->getPages() as $page) {
			// The category id is the text between "cid/" and "/order" in the listing URL.
			$posStart = strpos($page, 'cid/') + 4;
			$posEnd   = strpos($page, '/order');
			$cid = substr($page, $posStart, $posEnd - $posStart);

			$htmlData = file_get_contents($page);
			if ($htmlData === false) {
				continue;
			}

			// NOTE(review): the leading part of this pattern appears to have been lost when
			// the code was published (markup stripped). As written, group 1 captures whatever
			// precedes "</a></h3>" rather than an id; restore the original pattern before use.
			preg_match_all('/

(.*?)<\/a><\/h3>/ism', $htmlData, $sidArr);
			if (empty($sidArr[1])) {
				continue;
			}
			foreach ($sidArr[1] as $sid) {
				// sid is the key, cid the value.
				$sids[$sid] = $cid;
			}
		}
		return $sids;
	}

	/**
	 * Fetches the detail page of every collected sid and stores new products.
	 *
	 * For each sid the detail page is downloaded, the fields are extracted
	 * with regular expressions, and a row is inserted into market_product
	 * unless a row with the same package_apk already exists. Entries whose
	 * page cannot be downloaded, whose mandatory fields cannot be extracted,
	 * or whose category is unknown are skipped.
	 *
	 * NOTE(review): several patterns below look truncated (their markup seems
	 * to have been stripped when the code was published). They are kept
	 * exactly as found and must be restored before the crawler can work.
	 *
	 * @return void
	 */
	public function getData()
	{
		// Source category id => local category id.
		$cateMap = [
			19     => 11, // casual & puzzle
			20     => 12, // action & adventure
			51     => 34, // sports & racing
			52     => 12, // flight & shooter
			53     => 32, // management & strategy
			54     => 13, // board & card
			102238 => 11, // kids
			101587 => 33, // role playing
			100451 => 14, // online games
		];

		$sids = $this->getSids();
		foreach ($sids as $sid => $cid) {
			// Unknown categories are skipped instead of reusing a stale $cateid.
			if (!isset($cateMap[$cid])) {
				continue;
			}
			$cateid = $cateMap[$cid];

			// The sid goes into the SQL unquoted, so it must be purely numeric.
			if (!preg_match('/^\d+$/', (string) $sid)) {
				continue;
			}
			// Id of the product in the source market.
			$b_id = $sid;

			$detailUrl = 'http://zhushou.360.cn/detail/index/soft_id/'.$sid;
			$htmlData = file_get_contents($detailUrl);
			if ($htmlData === false) {
				continue;
			}

			// Name and icon; the icon is meant to be saved to the local server.
			// NOTE(review): pattern has no capture groups as found; groups 1 (icon) and 2 (name) are expected.
			preg_match_all('/
<\/dt>/ism', $htmlData, $iconArr);
			if (!isset($iconArr[1][0], $iconArr[2][0])) {
				continue;
			}
			$name = $iconArr[2][0];
			$icon = $this->saveIconToLocal($iconArr[1][0]);

			// Version number.
			preg_match_all('/版本:<\/strong>(.*?)<\/td>/ism', $htmlData, $versionArr);
			$version = isset($versionArr[1][0]) ? strip_tags($versionArr[1][0]) : '';

			// Download count (first match) and size (second match).
			// NOTE(review): pattern looks truncated; the opening <span ...> part is missing.
			preg_match_all('/(.*?)<\/span>/ism', $htmlData, $downArr);
			if (!isset($downArr[0][0], $downArr[0][1])) {
				continue;
			}
			$size  = strip_tags($downArr[0][1]);
			$count = $this->getCount(strip_tags($downArr[0][0]));
			// downnum is written unquoted into the SQL, so force it to be numeric.
			$downnum = is_numeric($count) ? $count : 0;

			// Download link: everything from the last "http" in the captured text.
			// NOTE(review): pattern has no capture group as found; group 1 is expected.
			preg_match_all('/ .*?<\/a>/ism', $htmlData, $urlArr);
			if (!isset($urlArr[1][0])) {
				continue;
			}
			$urlStr = $urlArr[1][0];
			$url = substr($urlStr, strrpos($urlStr, 'http'));

			// Package name: from the last "com" up to the last "_" of the download URL.
			$posStart = strrpos($url, 'com');
			$posEnd   = strrpos($url, '_');
			$package  = substr($url, $posStart, $posEnd - $posStart);

			// Introduction text.
			// NOTE(review): pattern looks truncated; the opening <div ...> part is missing.
			preg_match_all('/ .*?<\/div>/is', $htmlData, $introArr);
			$introStr = isset($introArr[0][0]) ? $introArr[0][0] : '';
			$pos = strpos($introStr, 'base-info');
			if ($pos !== false) {
				// Cut 12 bytes before the "base-info" marker so the tag that carries it is dropped too.
				$introStr = substr($introStr, 0, $pos - 12);
			}
			$intro = trim(strip_tags(html_entity_decode($introStr)));

			// Screenshots.
			$screen = $this->getScreen($htmlData);

			$uptime = date('Y-m-d H:i:s');

			// Skip products that are already stored. mysql_query() returns a result
			// resource, so the row count has to be read through mysql_num_rows().
			$sql = "SELECT id FROM market_product WHERE package_apk='".mysql_real_escape_string($package)."'";
			$res = mysql_query($sql);
			if ($res === false) {
				continue;
			}
			$exists = mysql_num_rows($res) > 0;
			mysql_free_result($res);
			if ($exists) {
				continue;
			}

			// All string values come from scraped pages, so they are escaped before
			// being placed into the statement.
			$sql = 'INSERT INTO market_product(name, icon, size, downurl, downnum, screenshots, version, intro, package_apk, b_id, cateid, uptime, score) values ("%s", "%s", "%s", "%s", %s, "%s", "%s", "%s", "%s", %s, %s, "%s", %d)';
			$sql = sprintf(
				$sql,
				mysql_real_escape_string($name),
				mysql_real_escape_string((string) $icon),
				mysql_real_escape_string($size),
				mysql_real_escape_string($url),
				$downnum,
				mysql_real_escape_string((string) $screen),
				mysql_real_escape_string($version),
				mysql_real_escape_string($intro),
				mysql_real_escape_string($package),
				$b_id,
				$cateid,
				$uptime,
				4
			);
			mysql_query($sql);
		}
	}

	/**
	 * Converts the scraped download-count text into a number.
	 *
	 * The first 9 bytes and the last 3 bytes of the input are dropped
	 * (presumably a three-character label and a one-character suffix in
	 * UTF-8; confirm against the page). If the remainder ends in '万' it is
	 * multiplied by 10000, if it ends in '亿' by 100000000; otherwise the
	 * remainder is returned unchanged.
	 *
	 * @param string $count raw text of the download counter
	 * @return int|float|string numeric value, or the trimmed string when no unit is present
	 */
	public function getCount($count)
	{
		$multipliers = ['万' => 10000, '亿' => 100000000];

		$numStr = substr($count, 9, -3);
		// The unit characters occupy 3 bytes each in UTF-8, hence the byte-wise substr().
		$unit = substr($numStr, -3);
		if (!isset($multipliers[$unit])) {
			return $numStr;
		}
		return substr($numStr, 0, -3) * $multipliers[$unit];
	}

	/**
	 * Extracts the screenshot list of a detail page as a comma-separated string.
	 *
	 * A first pattern is tried against the page; when it yields nothing a
	 * second pattern is used. The captured fragments are joined and scanned
	 * again, and the resulting captures are joined with commas.
	 *
	 * @param string $htmlData HTML of the detail page
	 * @return string comma-separated captures, or '' when nothing was found
	 */
	public function getScreen($htmlData)
	{
		// NOTE(review): both container patterns look truncated (leading markup missing).
		preg_match_all('/ (.*?)<\/p>/ism', $htmlData, $screenArr);
		// preg_match_all() always fills one entry per group, so count($screenArr)
		// is never 0; the captures themselves have to be checked.
		if (empty($screenArr[1])) {
			preg_match_all('/

(.*?)<\/p>/ism', $htmlData, $screenArr);
		}
		if (empty($screenArr[1])) {
			return '';
		}

		$screenStr = implode(',', $screenArr[1]);
		// NOTE(review): this pattern is empty as found (presumably an <img> src matcher
		// that was stripped); with it, group 1 never exists and '' is returned.
		preg_match_all('//ism', $screenStr, $screen);
		if (empty($screen[1])) {
			return '';
		}
		return implode(',', $screen[1]);
	}

	/**
	 * Placeholder for downloading an icon to the local server.
	 *
	 * Not implemented yet; currently returns nothing (null).
	 *
	 * @param string $icon icon reference extracted from the detail page
	 * @return null
	 */
	public function saveIconToLocal($icon)
	{
		// todo code
	}
}

$spider = new Spider();
$spider->getData();

网友评论