From 3b7078a1ac4cdced11377cbeeb46c09d930ef419 Mon Sep 17 00:00:00 2001
From: JAE <hj.q@qq.com>
Date: Wed, 30 Apr 2014 11:31:04 +0800
Subject: [PATCH] =?UTF-8?q?=E8=A7=84=E8=8C=83=E5=91=BD=E5=90=8D?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 QueryList.class.php | 347 ++++++++++++++++++++------------------------
 1 file changed, 158 insertions(+), 189 deletions(-)

diff --git a/QueryList.class.php b/QueryList.class.php
index 91b70bc..01dd960 100644
--- a/QueryList.class.php
+++ b/QueryList.class.php
@@ -1,191 +1,160 @@
 <?php
-	/**
-	 * QueryList
-	 *
-	 * 一个基于phpQuery的通用列表采集类
-	 * 
-	 * @author 			Jaeger
-	 * @email 			734708094@qq.com
-	 * @link            http://git.oschina.net/jae/QueryList
-	 * @version         1.6.0     
-	 */
-	require('phpQuery/phpQuery.php');
-	class QueryList{
-		
-		 private $pageURL;
-		 private $regArr = array();
-		 public $jsonArr = array();
-		 private $regRange;
-		 private $html;
-		 private $output_encoding;
-		 private $html_encoding;
-		 /**
-		  * 构造函数
-		  * @param string $page            要抓取的网页URL地址(支持https);或者是html源代码
-		  * @param array  $regArr         【选择器数组】说明:格式array("名称"=>array("选择器","类型"),.......),【类型】说明:值 "text" ,"html" ,"属性" 
-		  * @param string $regRange       【块选择器】:指 先按照规则 选出 几个大块 ,然后再分别再在块里面 进行相关的选择
-		  * @param string $getHtmlWay     【源码获取方式】指是通过curl抓取源码,还是通过file_get_contents抓取源码
-		  * @param string $output_encoding【输出编码格式】指要以什么编码输出(UTF-8,GB2312,.....),防止出现乱码,如果设置为 假值 则不改变原字符串编码
-		  */
-		public function QueryList($page,$regArr,$regRange='',$getHtmlWay="curl",$output_encoding=false)
-		 {
-		 	
-			$this->output_encoding = $output_encoding;
-			if($this->isURL($page))
-		 	{
-				$this->pageURL = $page;
-				if($getHtmlWay=="curl")
-		        {
-		       	 //为了能获取https://
-				   $ch = curl_init();
-					curl_setopt($ch, CURLOPT_URL,$this->pageURL);
-					curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); 
-					curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
-					curl_setopt($ch,CURLOPT_RETURNTRANSFER,1); 
-					$this->html = curl_exec($ch);
-	               curl_close($ch);
-		        }else{
-		       		$this->html=file_get_contents($this->pageURL);
-		        }
-		 	}else{
-		 		$this->html = $page;
-		 	}
-	       
-		   //获取编码格式
-		   $this->html_encoding = $this->get_encode($this->html);
-
-
-			 if(!empty($regArr))
-			 {
-			
-				  $this->regArr = $regArr;
-				 $this->regRange = $regRange;
-				 $this->getList();
-			 }
-			   
-		 }
-		public function setQuery($regArr,$regRange='')
-		 {
-			 $this->jsonArr=array();
-			 $this->regArr = $regArr;
-			 $this->regRange = $regRange;
-			 $this->getList();
-	     }
-	    private function getList()
-		 {
-			 
-             $hobj = phpQuery::newDocumentHTML($this->html);
-
-			 if(!empty($this->regRange))
-			 {
-			 $robj = pq($hobj)->find($this->regRange);
-			
-			  $i=0;
-			 foreach($robj as $item)
-			 {
-			     
-				 while(list($key,$reg_value)=each($this->regArr))
-				 {
-					 $iobj = pq($item)->find($reg_value[0]);
-					
-					   switch($reg_value[1])
-					   {
-						   case 'text':
-						   		 $this->jsonArr[$i][$key] = trim(pq($iobj)->text());
-								 break;
-				           case 'html':
-						  		 $this->jsonArr[$i][$key] = trim(pq($iobj)->html());
-								 break;
-						   default:
-						   		$this->jsonArr[$i][$key] = pq($iobj)->attr($reg_value[1]);
-								break;
-						   
-						}
-				 }
-				 //重置数组指针
-				 reset($this->regArr);
-				 $i++;
-			  }
-			 }
-			 else
-			 {
-		    while(list($key,$reg_value)=each($this->regArr))
-			 {
-				$lobj = pq($hobj)->find($reg_value[0]);
-				   
-				   
-				   $i=0;
-				   foreach($lobj as $item)
-				   {
-					   switch($reg_value[1])
-					   {
-						   case 'text':
-						   		 $this->jsonArr[$i++][$key] = trim(pq($item)->text());
-								 break;
-				           case 'html':
-						  		 $this->jsonArr[$i++][$key] = trim(pq($item)->html());
-								 break;
-						   default:
-						   		$this->jsonArr[$i++][$key] = pq($item)->attr($reg_value[1]);
-								break;
-						   
-						}
-					  
-					 
-				   }
-				 
-		
-			 }
-		   }
-		   if($this->output_encoding)
-		   {
-			   //编码转换
-			   $this->jsonArr = $this->array_convert_encoding($this->jsonArr,$this->output_encoding,$this->html_encoding);
-		   }
-		 }	
-		public function getJSON()
-		 {
-			 return json_encode($this->jsonArr);
-		 } 
-		/**
-		 * 获取文件编码
-		 * @param $string
-		 * @return string
-		 */
-		private function get_encode($string){
-		    return mb_detect_encoding($string, array('ASCII','GB2312','GBK','UTF-8')); 
-		}
-		/**
-		 * 递归转换数组值得编码格式
-		 * @param  array $arr           
-		 * @param  string $to_encoding   
-		 * @param  string $from_encoding 
-		 * @return array                
-		 */
-		private function array_convert_encoding($arr,$to_encoding,$from_encoding)
-		{
-		    if(!is_array($arr))return $arr;
-		    foreach ($arr as $key => $value) {
-		        if (is_array($value)) {
-		           $arr[$key] = $this->array_convert_encoding($value,$to_encoding,$from_encoding);
-		        }else{
-		           $arr[$key] = mb_convert_encoding($value, $to_encoding,$from_encoding);
-		        }
-		    }
-		    return $arr;
-		}
-		/**
-		 * 简单的判断一下参数是否为一个URL链接
-		 * @param  string  $str 
-		 * @return boolean      
-		 */
-		private function isURL($str)
-		{
-			if(preg_match('/^http(s)?:\/\/.+/', $str))
-			{
-				return true;
-			}
-			return false;
-		}
-		
+/**
+ * QueryList
+ *
+ * A general-purpose list-scraping class built on phpQuery
+ * 
+ * @author 			Jaeger
+ * @email 			734708094@qq.com
+ * @link            http://git.oschina.net/jae/QueryList
+ * @version         1.6.1     
+ */
+require 'phpQuery/phpQuery.php';
+class QueryList
+{
+    private $pageURL;          // URL the page was fetched from; set only when $page is a URL
+    private $regArr = array(); // selector map: name => array(selector, type)
+    public $jsonArr = array(); // extracted rows; starts as array() so getJSON() gives "[]", never "null"
+    private $regRange;         // block selector that scopes each row; empty for none
+    private $html;             // HTML source being queried
+    private $outputEncoding;   // encoding to convert results to; a falsy value disables conversion
+    private $htmlEncoding;     // encoding detected for $html; false when detection fails
+    /**
+     * Load a page and, when selectors are given, run the query immediately.
+     * @param string $page           URL to fetch (http or https), or the HTML source itself.
+     * @param array  $regArr         Selector map array("name" => array("selector", "type"), ...); type is "text", "html" or an attribute name.
+     * @param string $regRange       Block selector: the blocks are selected first, then each selector is applied inside every block.
+     * @param string $getHtmlWay     'curl' fetches the URL with cURL; any other value uses file_get_contents().
+     * @param string $outputEncoding Encoding to output (UTF-8, GB2312, ...) to avoid mojibake; a falsy value keeps the original encoding.
+     */
+    public function __construct($page, $regArr, $regRange = '', $getHtmlWay = 'curl', $outputEncoding = false)
+    {
+        $this->outputEncoding = $outputEncoding;
+        if ($this->_isURL($page)) {
+            $this->pageURL = $page;
+            if ($getHtmlWay == 'curl') {
+                // NOTE(review): peer and host verification are switched off so that any https://
+                // page can be fetched; the response is therefore not authenticated.
+                $ch = curl_init();
+                curl_setopt($ch, CURLOPT_URL, $this->pageURL);
+                curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
+                curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
+                curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
+                $this->html = curl_exec($ch);
+                curl_close($ch);
+            } else {
+                $this->html = file_get_contents($this->pageURL);
+            }
+        } else {
+            $this->html = $page;
+        }
+        // Remember the source encoding for the optional conversion done in _getList().
+        $this->htmlEncoding = $this->_getEncode($this->html);
+        if (!empty($regArr)) {
+            $this->regArr = $regArr;
+            $this->regRange = $regRange;
+            $this->_getList();
+        }
+    }
+    /**
+     * Discard previous results and run a new query against the already loaded HTML.
+     * @param array  $regArr   Selector map, same format as in the constructor.
+     * @param string $regRange Optional block selector.
+     */
+    public function setQuery($regArr, $regRange = '')
+    {
+        $this->jsonArr = array();
+        $this->regArr = $regArr;
+        $this->regRange = $regRange;
+        $this->_getList();
+    }
+    /** Apply the selector map to the HTML and store the resulting rows in $jsonArr. */
+    private function _getList()
+    {
+        $hobj = phpQuery::newDocumentHTML($this->html);
+        // foreach instead of each(): each() was removed in PHP 8 and depends on the array's internal pointer.
+        if (!empty($this->regRange)) {
+            // One row per matched block; every selector is resolved inside that block.
+            $i = 0;
+            foreach (pq($hobj)->find($this->regRange) as $item) {
+                foreach ($this->regArr as $key => $reg_value) {
+                    $this->jsonArr[$i][$key] = $this->_extract(pq($item)->find($reg_value[0]), $reg_value[1]);
+                }
+                $i++;
+            }
+        } else {
+            // No block selector: the n-th match of each selector is written into row n.
+            foreach ($this->regArr as $key => $reg_value) {
+                $i = 0;
+                foreach (pq($hobj)->find($reg_value[0]) as $item) {
+                    $this->jsonArr[$i++][$key] = $this->_extract($item, $reg_value[1]);
+                }
+            }
+        }
+        // Convert only when a target encoding was requested and the source encoding is known.
+        if ($this->outputEncoding && $this->htmlEncoding) {
+            $this->jsonArr = $this->_arrayConvertEncoding($this->jsonArr, $this->outputEncoding, $this->htmlEncoding);
+        }
+    }
+    /**
+     * Read one value from a matched node.
+     * @param mixed  $node Node or phpQuery object to read from.
+     * @param string $type "text", "html", or the name of an attribute.
+     * @return mixed Trimmed text or HTML, or whatever attr() returns for the attribute.
+     */
+    private function _extract($node, $type)
+    {
+        switch ($type) {
+        case 'text':
+            return trim(pq($node)->text());
+        case 'html':
+            return trim(pq($node)->html());
+        default:
+            return pq($node)->attr($type);
+        }
+    }
+    /** @return string The collected rows encoded as JSON. */
+    public function getJSON()
+    {
+        return json_encode($this->jsonArr);
+    }
+    /**
+     * Detect the encoding of a string among ASCII, GB2312, GBK and UTF-8.
+     * @param string $string
+     * @return string|bool Encoding name as reported by mb_detect_encoding(), false when detection fails.
+     */
+    private function _getEncode($string)
+    {
+        return mb_detect_encoding($string, array('ASCII', 'GB2312', 'GBK', 'UTF-8'));
+    }
+    /**
+     * Recursively convert the encoding of every value in an array.
+     * @param  array  $arr          Values to convert; a non-array argument is returned unchanged.
+     * @param  string $toEncoding   Target encoding.
+     * @param  string $fromEncoding Source encoding.
+     * @return array
+     */
+    private function _arrayConvertEncoding($arr, $toEncoding, $fromEncoding)
+    {
+        if (!is_array($arr)) {
+            return $arr;
+        }
+        foreach ($arr as $key => $value) {
+            if (is_array($value)) {
+                $arr[$key] = $this->_arrayConvertEncoding($value, $toEncoding, $fromEncoding);
+            } else {
+                $arr[$key] = mb_convert_encoding($value, $toEncoding, $fromEncoding);
+            }
+        }
+        return $arr;
+    }
+    /**
+     * Rough check for whether the argument is an http:// or https:// URL.
+     * @param  string  $str
+     * @return boolean
+     */
+    private function _isURL($str)
+    {
+        return preg_match('/^http(s)?:\\/\\/.+/', $str) === 1;
+    }
 }
\ No newline at end of file