diff --git a/QueryList.class.php b/QueryList.class.php index 16c3e6b..3666f2d 100644 --- a/QueryList.class.php +++ b/QueryList.class.php @@ -7,7 +7,7 @@ * @author Jaeger * @email 734708094@qq.com * @link http://git.oschina.net/jae/QueryList - * @version 2.2.0 + * @version 2.2.1 * * @example * @@ -55,6 +55,8 @@ class QueryList private $outputEncoding; private $htmlEncoding; private static $ql; + private function __construct() { + } /** * 静态方法,访问入口 * @param string $page 要抓取的网页URL地址(支持https);或者是html源代码 @@ -119,7 +121,7 @@ class QueryList } //获取编码格式 $this->htmlEncoding = $this->_getEncode($this->html); - $this->html = $this->_removeTags($this->html,array('script','style')); + // $this->html = $this->_removeTags($this->html,array('script','style')); if (!empty($regArr)) { $this->regArr = $regArr; $this->regRange = $regRange; diff --git a/demo/demo.php b/demo/demo.php index 63eb10b..dc4c7c6 100644 --- a/demo/demo.php +++ b/demo/demo.php @@ -1,17 +1,20 @@ array(".code_title a:eq(0)","text"),"url"=>array(".code_title a:eq(0)","href"),"author"=>array("img","title")); $rang = ".code_list li"; -//使用curl抓取源码并以GB2312编码格式输出 -$hj = QueryList::Query($url,$reg,$rang,'curl','GB2312'); +//使用curl抓取源码并以GBK编码格式输出 +$hj = QueryList::Query($url,$reg,$rang,'curl','GBK'); $arr = $hj->jsonArr; echo "
";
 print_r($arr);
 echo "

"; +echo '上面的是GBK格式输出的,而页面是UTF-8格式的,所以会看到输出是乱码!'; +echo '
'; + //如果还想采当前页面右边的 TOP40活跃贡献者 图像,得到JSON数据,可以这样写 $reg = array("portrait"=>array(".hot_top img","src")); $hj->setQuery($reg); @@ -25,4 +28,41 @@ $hj = QueryList::Query($url,$reg); $arr = $hj->jsonArr; echo "
";
 print_r($arr);
-echo "

"; \ No newline at end of file +echo "
"; + +//抓取网站基本信息 +//设置规则 +$reg = array( + //抓取网站keywords + "kw" => array("meta[name=keywords]","content"), + //抓取网站描述 + "desc" => array("meta[name=description]","content"), + //抓取网站标题 + "title" => array("title","text"), + //抓取网站第一个css link的链接 + "css1" => array("link:eq(0)","href"), + //抓取网站第二个js link的链接 + "js2" => array("script[src]:eq(1)","src") + ); +//抓取的目标站 +$url = 'http://x.44i.cc/'; +//抓取 +$data = QueryList::Query($url,$reg)->jsonArr; +print_r($data); + +//下面单独演示回调函数的用法 +//抓取网站keywords并分离每个关键词 +$reg = array( + //抓取网站keywords,并调用自定义函数fun + "kw" => array("meta[name=keywords]","content",'','fun') + ); +//自定义回调函数 +function fun($content,$key){ + //分离关键词 + return explode(',', $content); +} +//抓取的目标站 +$url = 'http://x.44i.cc/'; +//抓取 +$data = QueryList::Query($url,$reg)->jsonArr; +print_r($data); \ No newline at end of file diff --git a/demo/一个完整的DEMO项目.rar b/demo/一个完整的DEMO项目.rar deleted file mode 100644 index c50397c..0000000 Binary files a/demo/一个完整的DEMO项目.rar and /dev/null differ diff --git a/demo/一个完整的DEMO项目.zip b/demo/一个完整的DEMO项目.zip new file mode 100644 index 0000000..74d997e Binary files /dev/null and b/demo/一个完整的DEMO项目.zip differ