OK

2014-06-12 14:58:02 +08:00
parent 3b7078a1ac
commit 0261fe2f22
5 changed files with 102 additions and 28 deletions
--- a/QueryList.class.php
+++ b/QueryList.class.php
@@ -7,42 +7,93 @@
 * @author 			Jaeger
 * @email 			734708094@qq.com
 * @link            http://git.oschina.net/jae/QueryList
- * @version         1.6.1     
+ * @version         2.0.0     
 *
 * @example 
 *
 //获取CSDN移动开发栏目下的文章列表标题
 $hj = QueryList::Query('http://mobile.csdn.net/',array("title"=>array('.unit h1','text')));
 print_r($hj->jsonArr);
 //获取CSDN文章页下面的文章标题和内容
 $url = 'http://www.csdn.net/article/2014-06-05/2820091-build-or-buy-a-mobile-game-backend';
 $reg = array(
    'title'=>array('h1','text'),    //获取纯文本格式的标题                   
    'summary'=>array('.summary','text','input strong'), //获取纯文本的文章摘要，但保留input和strong标签
    'content'=>array('.news_content','html','div a')    //获取html格式的文章内容，但过滤掉div和a标签
    );
 $rang = '.left';
 $hj = QueryList::Query($url,$reg,$rang,'curl');
 print_r($hj->jsonArr);
 //继续获取右边相关热门文章列表的标题以及链接地址
 $hj->setQuery(array('title'=>array('','text'),'url'=>array('a','href')),'#con_two_2 li');
 //输出json数据
 echo $hj->getJson();
 */
 require 'phpQuery/phpQuery.php';
 class QueryList
 {
    private $pageURL;
    private $regArr;
    public $jsonArr;
    private $regRange;
    private $html;
    private $outputEncoding;
    private $htmlEncoding;
    private static $ql;
    /**
-     * 构造函数
+     * 静态方法，访问入口
     * @param string $page            要抓取的网页URL地址(支持https);或者是html源代码
-     * @param array  $regArr         【选择器数组】说明：格式array("名称"=>array("选择器","类型"),.......),【类型】说明：值 "text" ,"html" ,"属性" 
+     * @param array  $regArr         【选择器数组】说明：格式array("名称"=>array("选择器","类型"[,"标签列表"]),.......),【类型】说明：值 "text" ,"html" ,"属性" ,【标签列表】:可选，当【类型】值为text时表示需要保留的HTML标签，为html时表示要过滤掉的HTML标签
     * @param string $regRange       【块选择器】：指 先按照规则 选出 几个大块 ，然后再分别再在块里面 进行相关的选择
     * @param string $getHtmlWay     【源码获取方式】指是通过curl抓取源码，还是通过file_get_contents抓取源码
     * @param string $outputEncoding【输出编码格式】指要以什么编码输出(UTF-8,GB2312,.....)，防止出现乱码,如果设置为 假值 则不改变原字符串编码
     */
-    public function __construct($page, $regArr, $regRange = '', $getHtmlWay = 'curl', $outputEncoding = false)
+    public static function Query($page, $regArr, $regRange = '', $getHtmlWay = 'curl', $outputEncoding = false)
    {
        // Lazily create the single shared instance kept in self::$ql.
        if(!(self::$ql instanceof self))
        {
            self::$ql = new self();
        }
        // Run the query on the shared instance. Every call reuses the same object,
        // so a later call resets the results held by the object returned earlier.
        self::$ql->_query($page, $regArr, $regRange, $getHtmlWay, $outputEncoding);
        return self::$ql;
    }
    /**
     * 重新设置选择器
     * @param array $regArr   选择器数组
     * @param string $regRange 块选择器
     */
    public function setQuery($regArr, $regRange = '')
    {
        // Store the new extraction rules: the per-field selector array and the
        // optional block selector.
        $this->regRange = $regRange;
        $this->regArr = $regArr;
        // Discard the results of the previous query, then re-run extraction;
        // _getList() parses the HTML already held in $this->html.
        $this->jsonArr = array();
        $this->_getList();
    }
    /**
     * 得到JSON结构的结果
     * @return string
     */
    public function getJSON()
    {
        // Serialize the result rows currently held in $this->jsonArr to a JSON string.
        $encoded = json_encode($this->jsonArr);
        return $encoded;
    }
    private function _query($page, $regArr, $regRange, $getHtmlWay, $outputEncoding)
    {
        $this->jsonArr = array();
        $this->outputEncoding = $outputEncoding;
        if ($this->_isURL($page)) {
            $this->pageURL = $page;
            if ($getHtmlWay == 'curl') {
                //为了能获取https://
                $ch = curl_init();
-                curl_setopt($ch, CURLOPT_URL, $this->pageURL);
+                curl_setopt($ch, CURLOPT_URL, $page);
                curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
                curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
                curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
                $this->html = curl_exec($ch);
                curl_close($ch);
            } else {
-                $this->html = file_get_contents($this->pageURL);
+                $this->html = file_get_contents($page);
            }
        } else {
            $this->html = $page;
@@ -55,13 +106,6 @@ class QueryList
            $this->_getList();
        }
    }
    // Replaces the selector array and block selector, clears the previous results,
    // and re-runs extraction through _getList().
    public function setQuery($regArr, $regRange = '')
    {
        $this->jsonArr = array();
        $this->regArr = $regArr;
        $this->regRange = $regRange;
        $this->_getList();
    }
    private function _getList()
    {
        $hobj = phpQuery::newDocumentHTML($this->html);
@@ -70,13 +114,14 @@ class QueryList
            $i = 0;
            foreach ($robj as $item) {
                while (list($key, $reg_value) = each($this->regArr)) {
                    $tags = isset($reg_value[2])?$reg_value[2]:'';
                    $iobj = pq($item)->find($reg_value[0]);
                    switch ($reg_value[1]) {
                    case 'text':
-                        $this->jsonArr[$i][$key] = trim(pq($iobj)->text());
+                        $this->jsonArr[$i][$key] = $this->_allowTags(pq($iobj)->html(),$tags);
                        break;
                    case 'html':
-                        $this->jsonArr[$i][$key] = trim(pq($iobj)->html());
+                        $this->jsonArr[$i][$key] = $this->_stripTags(pq($iobj)->html(),$tags);
                        break;
                    default:
                        $this->jsonArr[$i][$key] = pq($iobj)->attr($reg_value[1]);
@@ -89,15 +134,16 @@ class QueryList
            }
        } else {
            while (list($key, $reg_value) = each($this->regArr)) {
                $tags = isset($reg_value[2])?$reg_value[2]:'';
                $lobj = pq($hobj)->find($reg_value[0]);
                $i = 0;
                foreach ($lobj as $item) {
                    switch ($reg_value[1]) {
                    case 'text':
-                        $this->jsonArr[$i++][$key] = trim(pq($item)->text());
+                        $this->jsonArr[$i++][$key] = $this->_allowTags(pq($item)->html(),$tags);
                        break;
                    case 'html':
-                        $this->jsonArr[$i++][$key] = trim(pq($item)->html());
+                        $this->jsonArr[$i++][$key] = $this->_stripTags(pq($item)->html(),$tags);
                        break;
                    default:
                        $this->jsonArr[$i++][$key] = pq($item)->attr($reg_value[1]);
@@ -111,10 +157,6 @@ class QueryList
            $this->jsonArr = $this->_arrayConvertEncoding($this->jsonArr, $this->outputEncoding, $this->htmlEncoding);
        }
    }
    // Returns the result rows in $this->jsonArr encoded with json_encode().
    public function getJSON()
    {
        return json_encode($this->jsonArr);
    }
    /**
     * 获取文件编码
     * @param $string
@@ -157,4 +199,36 @@ class QueryList
        }
        return false;
    }
-}
+    /**
     * 去除特定的html标签
     * @param  string $html 
     * @param  string $tags 多个标签名之间用空格隔开
     * @return string       
     */
    private function _stripTags($html,$tags)
    {
        // Removes the opening and closing tags named in $tags (a whitespace-separated
        // list of tag names) from $html; the content between the tags is kept.
        // Returns the trimmed $html unchanged when $tags names no tags.
        $html = trim($html);
        $tagsArr = preg_split("/\s+/",$tags,-1,PREG_SPLIT_NO_EMPTY);
        if (empty($tagsArr)) {
            return $html;
        }
        $p = array();
        foreach ($tagsArr as $tag) {
            // Quote the name so regex metacharacters in it are matched literally.
            // The lookahead requires the tag name to end right there (whitespace,
            // "/" or ">"), so stripping "a" does not also strip <abbr>, <article>, ...
            $p[] = '/<\/?'.preg_quote($tag,'/').'(?=[\s\/>])[^>]*>/i';
        }
        return preg_replace($p,"",$html);
    }
    /**
     * 保留特定的html标签
     * @param  string $html 
     * @param  string $tags 多个标签名之间用空格隔开
     * @return string       
     */
    private function _allowTags($html,$tags)
    {
        // Strips every HTML tag from $html except those named in $tags
        // (a whitespace-separated list of tag names).
        $names = preg_split("/\s+/",$tags,-1,PREG_SPLIT_NO_EMPTY);
        // Build the "<a> <b> " style allow-list that strip_tags() expects;
        // it is an empty string when no tags are to be kept.
        $allowed = $names ? '<'.implode('> <',$names).'> ' : '';
        return strip_tags(trim($html),$allowed);
    }
 }
--- a/README.md
+++ b/README.md
@@ -1 +1 @@
-
+
--- a/demo/Searcher.class.php
+++ b/demo/Searcher.class.php
@@ -48,7 +48,7 @@ require '../QueryList.class.php';
               $reg_znum='/([\d,]+) result(s)?/';
               $getHtmlWay = 'curl';
           }
-           $searcherObj = new QueryList($url,$this->regArr,$this->regRange,$getHtmlWay,false);
+           $searcherObj = QueryList::Query($url,$this->regArr,$this->regRange,$getHtmlWay,false);
           for($i=0;$i<count($searcherObj->jsonArr);$i++)
           {
               if($this->searcher=='baidu')
--- a/demo/demo.php
+++ b/demo/demo.php
@@ -6,7 +6,7 @@ $url = "http://www.oschina.net/code/list";
 $reg = array("title"=>array(".code_title a:eq(0)","text"),"url"=>array(".code_title a:eq(0)","href"),"author"=>array("img","title"));
 $rang = ".code_list li";
 //使用curl抓取源码并以GB2312编码格式输出
-$hj = new QueryList($url,$reg,$rang,'curl','GB2312');
+$hj = QueryList::Query($url,$reg,$rang,'curl','GB2312');
 $arr = $hj->jsonArr;
 echo "<pre>";
 print_r($arr);
@@ -21,7 +21,7 @@ echo $json . "<hr/>";
 //采OSC内容页内容
 $url = "http://www.oschina.net/code/snippet_186288_23816";
 $reg = array("title"=>array(".QTitle h1","text"),"con"=>array(".Content","html"));
-$hj = new QueryList($url,$reg);
+$hj = QueryList::Query($url,$reg);
 $arr = $hj->jsonArr;
 echo "<pre>";
 print_r($arr);
--- a/demo/thanks.png
+++ b/demo/thanks.png