@version 1.0
这个只能爬取静态的网站模版,代码逻辑需要优化。
正确的逻辑应该是从首页开始爬,
获取HTML,JS,IMAGE,CSS,然后从CSS中分析额外加载的CSS,最后分析所有的CSS中包含的图片引用。
<?php
/**
 * Mirrors a static website template ("theme") into ./theme/<name>/.
 *
 * Starting from index.html the crawler:
 *   1. saves index.html locally,
 *   2. collects every *.html page linked from index.html (one level deep only),
 *   3. downloads the scripts, images and stylesheets referenced by each page.
 *
 * Limitations (unchanged from the original design): only attributes written
 * as `<a href=`, `<script src=`, `<img src=` and `<link href=` (attribute
 * directly after the tag name) are recognised, and stylesheets are not parsed
 * during getTheme() -- see getStyleImages().
 */
class NetworkReptiles
{
    // Patterns extracting page links and JS / image / CSS references.
    // Capture group 1 is the raw attribute value.
    // NOTE: "href_patten" is misspelled but kept, since subclasses may use it.
    protected $href_patten = '/<a href=[\'"]?([^\'" ]+).*?>/';
    protected $script_pattern = '/<script src=[\'"]?([^\'" ]+).*?>/';
    protected $image_pattern = '/<img src=[\'"]?([^\'" ]+).*?>/';
    protected $link_pattern = '/<link href=[\'"]?([^\'" ]+).*?>/';

    // Theme name; used as the local directory name.
    private $theme_name = null;
    // Remote base URL of the theme; relative paths are appended to it.
    private $theme_base_url = null;
    // Local directory the theme is stored in (with trailing slash).
    private $themeDir = null;
    // HTML of the page currently being analysed.
    private $current_data = null;
    // Relative paths of the HTML pages discovered from index.html.
    private $_html_resource = [];

    /**
     * @param string|false $name Theme name (local directory name).
     * @param string|false $url  Remote base URL, expected to end with "/".
     */
    public function __construct($name = false, $url = false)
    {
        $this->theme_name = $name ? $name : false;
        $this->theme_base_url = $url ? $url : false;
    }

    /**
     * Sets theme name and base URL from an options array.
     *
     * @param array $opt Array with the keys 'name' and 'url'.
     */
    public function setTheme($opt)
    {
        $this->theme_name = isset($opt['name']) ? $opt['name'] : false;
        $this->theme_base_url = isset($opt['url']) ? $opt['url'] : false;
    }

    /**
     * Downloads the configured theme into getcwd()/theme/<name>/.
     *
     * Progress is echoed to standard output.
     *
     * @throws LogicException   When name or base URL have not been configured.
     * @throws RuntimeException When the theme directory cannot be created.
     */
    public function getTheme()
    {
        if (!$this->theme_name || !$this->theme_base_url) {
            throw new LogicException('Theme name and base URL must be set before calling getTheme().');
        }

        set_time_limit(0);

        $this->themeDir = getcwd() . '/theme/' . $this->theme_name . '/';
        // Recursive, because the parent "theme" directory may not exist yet.
        if (!is_dir($this->themeDir)
            && !mkdir($this->themeDir, 0755, true)
            && !is_dir($this->themeDir)
        ) {
            throw new RuntimeException('Unable to create theme directory ' . $this->themeDir);
        }

        // NOTE: images referenced from stylesheets are not fetched here;
        // getStyleImages() exists for that but is intentionally not wired in.

        // Start from a clean list so that repeated calls do not accumulate pages.
        $this->_html_resource = [];

        // Fetch the home page and every HTML page it links to.
        $this->current_data = $this->getHtmlData('index.html');
        $this->getHtml();

        // Assets of the home page.
        $this->showMsg('index.html');
        $this->downloadResource();

        // Assets of every linked page.
        foreach ($this->_html_resource as $html) {
            $this->showMsg($html);
            $this->current_data = $this->getHtmlData($html);
            $this->downloadResource();
        }
    }

    /**
     * Prints a progress banner for the page whose assets are being downloaded.
     *
     * @param string $html Relative path of the page.
     */
    private function showMsg($html)
    {
        echo "download resource $html\n";
        echo str_repeat('-', 30) . "\n";
    }

    /**
     * Returns the content of a theme file, downloading and caching it locally
     * when it is not on disk yet.
     *
     * @param string $file_name Path relative to the theme root.
     * @return string File content, or an empty string when it cannot be obtained.
     */
    private function getHtmlData($file_name)
    {
        $file_path = $this->themeDir . $file_name;

        if (is_file($file_path)) {
            $data = file_get_contents($file_path);
            return $data === false ? '' : $data;
        }

        $data = file_get_contents($this->theme_base_url . $file_name);
        if ($data === false) {
            // Do not cache an empty file: a later run should retry the download.
            echo "fail download file " . $file_path . "\n";
            return '';
        }

        $this->createDirectory($file_name);
        file_put_contents($file_path, $data);

        return $data;
    }

    /**
     * Collects the *.html pages linked from the current page and downloads
     * the ones that are not on disk yet.
     */
    private function getHtml()
    {
        if (!preg_match_all($this->href_patten, (string) $this->current_data, $href_match)) {
            return;
        }

        foreach ($href_match[1] as $value) {
            if (!preg_match('/\.html$/', $value) || !$this->isLocalReference($value)) {
                continue;
            }
            // index.html is always processed first; the same page is often
            // linked many times (menus), so queue each page only once.
            if ($value === 'index.html' || in_array($value, $this->_html_resource, true)) {
                continue;
            }

            $this->_html_resource[] = $value;
            if (!file_exists($this->themeDir . $value)) {
                $this->downloadFile($value);
            }
        }
    }

    /**
     * Downloads the scripts, images and stylesheets referenced by the
     * current page.
     */
    private function downloadResource()
    {
        $patterns = [$this->script_pattern, $this->image_pattern, $this->link_pattern];

        foreach ($patterns as $pattern) {
            // 0 (no match) and false (PCRE error) are both skipped.
            if (!preg_match_all($pattern, (string) $this->current_data, $matches)) {
                continue;
            }
            foreach ($matches[1] as $value) {
                $this->downloadFile($value);
            }
        }
    }

    /**
     * Downloads one file below the theme base URL into the theme directory.
     * Existing files are left untouched. The outcome is echoed.
     *
     * @param string $filename Path relative to the theme root.
     */
    private function downloadFile($filename)
    {
        if (!$this->isLocalReference($filename)) {
            echo "skip external resource " . $filename . "\n";
            return;
        }

        $file_location = $this->themeDir . $filename;
        if (file_exists($file_location)) {
            echo "file already download $file_location\n";
            return;
        }

        $this->createDirectory($filename);

        $resourceData = false;
        $curl = curl_init($this->theme_base_url . $filename);
        if ($curl !== false) {
            curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
            // Treat HTTP >= 400 as a failure instead of saving the error page.
            curl_setopt($curl, CURLOPT_FAILONERROR, true);
            $resourceData = curl_exec($curl);
            curl_close($curl);
        }

        if (!is_string($resourceData)
            || file_put_contents($file_location, $resourceData) === false
        ) {
            echo "fail download file " . $file_location . "\n";
            return;
        }

        echo "download file " . $file_location . "\n";
    }

    /**
     * Creates, below the theme directory, every directory needed to store
     * the given relative file path.
     *
     * @param string $str File path relative to the theme root.
     */
    private function createDirectory($str)
    {
        $directory = dirname($this->themeDir . $str);

        // The second is_dir() covers a concurrent creation of the directory.
        if (!is_dir($directory) && !mkdir($directory, 0755, true) && !is_dir($directory)) {
            echo "fail create directory " . $directory . "\n";
        }
    }

    /**
     * Tells whether a reference can be fetched relative to the theme base URL
     * and stored inside the theme directory.
     *
     * @param mixed $path Raw attribute value.
     * @return bool False for empty values, values with a URL scheme
     *              (http:, data:, javascript:, ...), protocol-relative URLs
     *              and paths containing a ".." segment.
     */
    private function isLocalReference($path)
    {
        if (!is_string($path) || $path === '') {
            return false;
        }
        if (preg_match('#^(?:[a-z][a-z0-9+.\-]*:|//)#i', $path)) {
            return false;
        }
        // A ".." segment could make the crawler write outside the theme directory.
        if (preg_match('#(?:^|/)\.\.(?:/|$)#', $path)) {
            return false;
        }

        return true;
    }

    /**
     * Resolves a url() reference found in a stylesheet to a path relative to
     * the theme root.
     *
     * @param string $stylesheet Stylesheet path relative to the theme root.
     * @param string $reference  Raw value found inside url(...).
     * @return string|null Resolved path, or null when the reference is
     *                     external, root-relative, empty, or would leave the
     *                     theme root.
     */
    private function resolveStylePath($stylesheet, $reference)
    {
        $reference = trim($reference);
        if ($reference === '' || preg_match('#^(?:[a-z][a-z0-9+.\-]*:|/)#i', $reference)) {
            return null;
        }

        $base = dirname($stylesheet);
        $segments = ($base === '.' || $base === '') ? [] : explode('/', $base);

        foreach (explode('/', $reference) as $part) {
            if ($part === '' || $part === '.') {
                continue;
            }
            if ($part === '..') {
                if (!$segments) {
                    return null;
                }
                array_pop($segments);
                continue;
            }
            $segments[] = $part;
        }

        return $segments ? implode('/', $segments) : null;
    }

    /**
     * Downloads the PNG/JPG images referenced through url(...) by a fixed
     * list of stylesheets.
     *
     * @todo Also download stylesheets that are loaded from other stylesheets.
     */
    private function getStyleImages()
    {
        $style_path = [
            "css/style.default.css",
            "css/prettyPhoto.css",
            'css/bootstrap.min.css',
            'css/bootstrap-override.css',
            'css/weather-icons.min.css',
            'css/jquery-ui-1.10.3.css',
            'css/font-awesome.min.css',
            'css/animate.min.css',
            'css/animate.delay.css',
            'css/toggles.css',
            'css/select2.css',
            'css/lato.css',
            'css/roboto.css',
        ];

        foreach ($style_path as $value) {
            $data = $this->getHtmlData($value);
            // Matches url(x), url('x') and url("x"); group 1 is x without quotes.
            if (!preg_match_all('/url\(\s*[\'"]?([^\'")]+?)[\'"]?\s*\)/i', $data, $match)) {
                continue;
            }

            foreach ($match[1] as $image) {
                // References are relative to the stylesheet, not to the theme root.
                $realImagePath = $this->resolveStylePath($value, $image);
                if ($realImagePath !== null && preg_match('/\.(png|jpg)$/i', $realImagePath)) {
                    $this->downloadFile($realImagePath);
                }
            }
        }
    }
}
// Script entry point: mirror the "bracket" demo theme into ./theme/bracket/.
$themeName = "bracket";
$themeUrl = "http://themepixels.com/demo/webpage/bracket/";
$crawler = new NetworkReptiles($themeName, $themeUrl);
$crawler->getTheme();
