@version 1.0 这个只能爬取静态的网站模版,代码逻辑需要优化。 正确的逻辑应该是从首页开始爬, 获取HTML,JS,IMAGE,CSS,然后从CSS中分析额外加载的CSS,最后分析所有的CSS中包含的图片引用。
这个只能爬取静态的网站模版,代码逻辑需要优化。
正确的逻辑应该是从首页开始爬,
获取HTML,JS,IMAGE,CSS,然后从CSS中分析额外加载的CSS,最后分析所有的CSS中包含的图片引用。
<?php

/**
 * Mirrors a static website template into ./theme/<name>/ under the current
 * working directory.
 *
 * Starting from index.html it collects every linked *.html page, then
 * downloads the scripts, images and stylesheets each page references.
 * Files already present locally are never fetched again.
 */
class NetworkReptiles
{
    // Regexes that extract resource paths from HTML (capture group 1 is the
    // path). They only match tags whose FIRST attribute is href/src.
    // Single-quoted literals are used so the quote characters inside the
    // character classes cannot terminate the PHP string.
    protected $href_patten = '/<a href=[\'"]?([^\'" ]+).*?>/';
    protected $script_pattern = '/<script src=[\'"]?([^\'" ]+).*?>/';
    protected $image_pattern = '/<img src=[\'"]?([^\'" ]+).*?>/';
    protected $link_pattern = '/<link href=[\'"]?([^\'" ]+).*?>/';

    // Theme name; used as the local directory name.
    private $theme_name = null;

    // Remote base URL every relative path is appended to.
    private $theme_base_url = null;

    // Local directory the theme is stored in (set by getTheme(), ends with "/").
    private $themeDir = null;

    // HTML of the page currently being parsed.
    private $current_data = null;

    // Relative paths of the *.html pages discovered on the index page.
    private $_html_resource = array();

    /**
     * @param string|false $name Theme name (local directory name).
     * @param string|false $url  Remote base URL; paths are appended verbatim,
     *                           so it should end with "/".
     */
    public function __construct($name = false, $url = false)
    {
        $this->theme_name = ($name) ? $name : false;
        $this->theme_base_url = ($url) ? $url : false;
    }

    /**
     * Replaces the theme name and base URL.
     *
     * @param array $opt Expects the keys 'name' and 'url'; a missing key is
     *                   stored as false.
     */
    public function setTheme($opt)
    {
        $this->theme_name = isset($opt['name']) ? $opt['name'] : false;
        $this->theme_base_url = isset($opt['url']) ? $opt['url'] : false;
    }

    /**
     * Runs the crawl: index.html first, then every linked HTML page, then the
     * resources of each page. Progress and failures are echoed.
     *
     * @return void
     */
    public function getTheme()
    {
        set_time_limit(0);

        if (!$this->theme_name || !$this->theme_base_url) {
            echo "theme name and url must be set before calling getTheme()\n";
            return;
        }

        // The crawl starts from the index page inside the local theme directory.
        $this->themeDir = getcwd() . '/theme/' . $this->theme_name . "/";
        if (!$this->ensureDirectory($this->themeDir)) {
            echo "fail create directory " . $this->themeDir . "\n";
            return;
        }

        // Start from a clean list so repeated calls do not accumulate pages.
        $this->_html_resource = array();

        // NOTE: getStyleImages() is not part of the crawl yet (see its @todo).

        // Fetch the index page.
        $this->current_data = $this->getHtmlData("index.html");
        if ($this->current_data === false) {
            echo "fail download file " . $this->themeDir . "index.html\n";
            return;
        }

        // Download every HTML page linked from the index page.
        $this->getHtml();

        // Download the resources of the index page.
        $this->showMsg("index.html");
        $this->downloadResource();

        // Download the resources of each linked HTML page.
        foreach ($this->_html_resource as $html) {
            $this->showMsg($html);
            $this->current_data = $this->getHtmlData($html);
            if ($this->current_data === false) {
                // The page could not be fetched; there is nothing to parse.
                continue;
            }
            $this->downloadResource();
        }
    }

    /**
     * Prints a progress banner for the page whose resources are processed next.
     */
    private function showMsg($html)
    {
        echo "download resource $html\n";
        echo str_repeat("-", 30) . "\n";
    }

    /**
     * Returns the content of a theme file, preferring the local copy and
     * otherwise fetching it from the base URL and caching it locally.
     *
     * @param string $file_name Path relative to the theme root.
     * @return string|false File content, or false when it cannot be fetched.
     *                      A failed fetch is NOT cached, so a later run retries.
     */
    private function getHtmlData($file_name)
    {
        $file_path = $this->themeDir . $file_name;
        if (file_exists($file_path)) {
            return file_get_contents($file_path);
        }

        $data = $this->fetch($file_name);
        if ($data === false) {
            return false;
        }

        // Caching is best effort: the content is still usable if the write fails.
        $this->saveFile($file_path, $data);

        return $data;
    }

    /**
     * Collects the *.html links of the current page into $_html_resource
     * (without duplicates) and downloads the ones missing locally.
     */
    private function getHtml()
    {
        preg_match_all($this->href_patten, $this->current_data, $href_match);

        foreach ($href_match[1] as $value) {
            if (!preg_match('/^(.*)?\.(html)$/', $value) || !$this->isLocalPath($value)) {
                continue;
            }

            if (!in_array($value, $this->_html_resource, true)) {
                $this->_html_resource[] = $value;
            }

            if (!file_exists($this->themeDir . $value)) {
                $this->downloadFile($value);
            }
        }
    }

    /**
     * Downloads the scripts, images and stylesheets referenced by the current
     * page. External and directory-escaping references are skipped.
     */
    private function downloadResource()
    {
        // Order is kept from the original flow: JS, then images, then CSS.
        $patterns = array(
            $this->script_pattern,
            $this->image_pattern,
            $this->link_pattern,
        );

        foreach ($patterns as $pattern) {
            if (!preg_match_all($pattern, $this->current_data, $match)) {
                continue;
            }

            foreach (array_unique($match[1]) as $value) {
                if (!$this->isLocalPath($value)) {
                    continue;
                }
                $this->createDirectory($value);
                $this->downloadFile($value);
            }
        }
    }

    /**
     * Downloads one file below the base URL into the theme directory unless
     * it already exists locally.
     *
     * @param string $filename Path relative to the theme root.
     * @return bool True when the file exists locally afterwards.
     */
    private function downloadFile($filename)
    {
        $file_location = $this->themeDir . $filename;

        if (file_exists($file_location)) {
            echo "file already download $file_location\n";
            return true;
        }

        $resourceData = $this->fetch($filename);

        // Never write a failed transfer to disk: an empty file would be
        // mistaken for a finished download on the next run.
        if ($resourceData === false || !$this->saveFile($file_location, $resourceData)) {
            echo "fail download file " . $file_location . "\n";
            return false;
        }

        echo "download file " . $file_location . "\n";
        return true;
    }

    /**
     * Creates the local directory that will contain the given relative path.
     *
     * @param string $str Path relative to the theme root, e.g. "js/app.js".
     * @return bool True when the directory exists afterwards.
     */
    private function createDirectory($str)
    {
        $pos = strrpos($str, "/");
        if ($pos === false) {
            // The file lives directly in the theme root, which already exists.
            return true;
        }

        return $this->ensureDirectory($this->themeDir . substr($str, 0, $pos));
    }

    /**
     * Downloads the png/jpg images referenced via url(...) in a fixed list of
     * stylesheets. References are resolved relative to the stylesheet's own
     * directory.
     *
     * @todo Download the additional CSS files that are loaded from within CSS.
     */
    private function getStyleImages()
    {
        $style_path = array(
            "css/style.default.css",
            "css/prettyPhoto.css",
            'css/bootstrap.min.css',
            'css/bootstrap-override.css',
            'css/weather-icons.min.css',
            'css/jquery-ui-1.10.3.css',
            'css/font-awesome.min.css',
            'css/animate.min.css',
            'css/animate.delay.css',
            'css/toggles.css',
            'css/select2.css',
            'css/lato.css',
            'css/roboto.css'
        );

        foreach ($style_path as $value) {
            $data = $this->getHtmlData($value);
            if ($data === false) {
                continue;
            }

            // Capture the bare reference inside url(...), without optional
            // quotes, and without running past the closing parenthesis.
            if (!preg_match_all('/url\(\s*[\'"]?([^\'")\s]+)[\'"]?\s*\)/i', $data, $match)) {
                continue;
            }

            foreach (array_unique($match[1]) as $image) {
                // Grouped alternation: the extension must be png OR jpg at the end.
                if (!preg_match('/\.(png|jpg)$/i', $image)) {
                    continue;
                }

                $realImagePath = $this->resolvePath(dirname($value), $image);
                if ($realImagePath === false) {
                    continue;
                }

                $this->createDirectory($realImagePath);
                $this->downloadFile($realImagePath);
            }
        }
    }

    /**
     * Fetches a path below the base URL with cURL.
     *
     * @param string $path Path relative to the base URL.
     * @return string|false Response body, or false on a transport error or an
     *                      HTTP status >= 400 (CURLOPT_FAILONERROR).
     */
    private function fetch($path)
    {
        $curl = curl_init($this->theme_base_url . $path);
        if ($curl === false) {
            return false;
        }

        curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($curl, CURLOPT_FAILONERROR, true);

        // The handle is released automatically when $curl goes out of scope.
        return curl_exec($curl);
    }

    /**
     * Writes data to a local file, creating the parent directory if needed.
     *
     * @return bool True when the file was written.
     */
    private function saveFile($file_location, $data)
    {
        if (!$this->ensureDirectory(dirname($file_location))) {
            return false;
        }

        return file_put_contents($file_location, $data) !== false;
    }

    /**
     * Makes sure a directory exists, creating missing parents as well.
     *
     * @return bool True when the directory exists afterwards.
     */
    private function ensureDirectory($dir)
    {
        if (is_dir($dir)) {
            return true;
        }

        // The second is_dir() covers the directory appearing between the
        // check above and the mkdir() call.
        return mkdir($dir, 0755, true) || is_dir($dir);
    }

    /**
     * Tells whether a reference points outside the theme: empty, fragment
     * only, carrying a URL scheme (http:, data:, mailto:, ...) or
     * protocol-relative (//host/...).
     */
    private function isExternalReference($path)
    {
        if ($path === '' || $path[0] === '#') {
            return true;
        }

        return preg_match('#^(?:[a-z][a-z0-9+.\-]*:|//)#i', $path) === 1;
    }

    /**
     * Tells whether a reference can be appended to the base URL and stored
     * below the theme directory. Paths with ".." segments are rejected
     * because the path comes from remote content and must not be able to
     * write outside the theme directory.
     */
    private function isLocalPath($path)
    {
        if ($this->isExternalReference($path)) {
            return false;
        }

        return !in_array('..', explode('/', str_replace('\\', '/', $path)), true);
    }

    /**
     * Resolves a reference found in a file against that file's directory and
     * collapses "." and ".." segments.
     *
     * @param string $baseDir Directory of the referencing file, relative to
     *                        the theme root ("." or "" for the root itself).
     * @param string $ref     Reference as written in the file. A leading "/"
     *                        is treated as relative to the theme root.
     * @return string|false Normalised path relative to the theme root, or
     *                      false when the reference is external or would
     *                      leave the theme root.
     */
    private function resolvePath($baseDir, $ref)
    {
        if ($this->isExternalReference($ref)) {
            return false;
        }

        $fromRoot = ($ref[0] === '/' || $baseDir === '.' || $baseDir === '');
        $segments = $fromRoot ? array() : explode('/', $baseDir);

        foreach (explode('/', $ref) as $part) {
            if ($part === '' || $part === '.') {
                continue;
            }

            if ($part === '..') {
                if (!$segments) {
                    // Would climb above the theme root.
                    return false;
                }
                array_pop($segments);
                continue;
            }

            $segments[] = $part;
        }

        return $segments ? implode('/', $segments) : false;
    }
}

//"bracket","http://themepixels.com/demo/webpage/bracket/"
$nr = new NetworkReptiles("bracket", "http://themepixels.com/demo/webpage/bracket/");
$nr->getTheme();