diff --git a/QueryList.class.php b/QueryList.class.php index 91b70bc..01dd960 100644 --- a/QueryList.class.php +++ b/QueryList.class.php @@ -1,191 +1,160 @@ array("选择器","类型"),.......),【类型】说明:值 "text" ,"html" ,"属性" - * @param string $regRange 【块选择器】:指 先按照规则 选出 几个大块 ,然后再分别再在块里面 进行相关的选择 - * @param string $getHtmlWay 【源码获取方式】指是通过curl抓取源码,还是通过file_get_contents抓取源码 - * @param string $output_encoding【输出编码格式】指要以什么编码输出(UTF-8,GB2312,.....),防止出现乱码,如果设置为 假值 则不改变原字符串编码 - */ - public function QueryList($page,$regArr,$regRange='',$getHtmlWay="curl",$output_encoding=false) - { - - $this->output_encoding = $output_encoding; - if($this->isURL($page)) - { - $this->pageURL = $page; - if($getHtmlWay=="curl") - { - //为了能获取https:// - $ch = curl_init(); - curl_setopt($ch, CURLOPT_URL,$this->pageURL); - curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); - curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false); - curl_setopt($ch,CURLOPT_RETURNTRANSFER,1); - $this->html = curl_exec($ch); - curl_close($ch); - }else{ - $this->html=file_get_contents($this->pageURL); - } - }else{ - $this->html = $page; - } - - //获取编码格式 - $this->html_encoding = $this->get_encode($this->html); - - - if(!empty($regArr)) - { - - $this->regArr = $regArr; - $this->regRange = $regRange; - $this->getList(); - } - - } - public function setQuery($regArr,$regRange='') - { - $this->jsonArr=array(); - $this->regArr = $regArr; - $this->regRange = $regRange; - $this->getList(); - } - private function getList() - { - - $hobj = phpQuery::newDocumentHTML($this->html); - - if(!empty($this->regRange)) - { - $robj = pq($hobj)->find($this->regRange); - - $i=0; - foreach($robj as $item) - { - - while(list($key,$reg_value)=each($this->regArr)) - { - $iobj = pq($item)->find($reg_value[0]); - - switch($reg_value[1]) - { - case 'text': - $this->jsonArr[$i][$key] = trim(pq($iobj)->text()); - break; - case 'html': - $this->jsonArr[$i][$key] = trim(pq($iobj)->html()); - break; - default: - $this->jsonArr[$i][$key] = pq($iobj)->attr($reg_value[1]); - break; - - } - } - //重置数组指针 - reset($this->regArr); - $i++; - } - } - else - { - while(list($key,$reg_value)=each($this->regArr)) - { - $lobj = pq($hobj)->find($reg_value[0]); - - - $i=0; - foreach($lobj as $item) - { - switch($reg_value[1]) - { - case 'text': - $this->jsonArr[$i++][$key] = trim(pq($item)->text()); - break; - case 'html': - $this->jsonArr[$i++][$key] = trim(pq($item)->html()); - break; - default: - $this->jsonArr[$i++][$key] = pq($item)->attr($reg_value[1]); - break; - - } - - - } - - - } - } - if($this->output_encoding) - { - //编码转换 - $this->jsonArr = $this->array_convert_encoding($this->jsonArr,$this->output_encoding,$this->html_encoding); - } - } - public function getJSON() - { - return json_encode($this->jsonArr); - } - /** - * 获取文件编码 - * @param $string - * @return string - */ - private function get_encode($string){ - return mb_detect_encoding($string, array('ASCII','GB2312','GBK','UTF-8')); - } - /** - * 递归转换数组值得编码格式 - * @param array $arr - * @param string $to_encoding - * @param string $from_encoding - * @return array - */ - private function array_convert_encoding($arr,$to_encoding,$from_encoding) - { - if(!is_array($arr))return $arr; - foreach ($arr as $key => $value) { - if (is_array($value)) { - $arr[$key] = $this->array_convert_encoding($value,$to_encoding,$from_encoding); - }else{ - $arr[$key] = mb_convert_encoding($value, $to_encoding,$from_encoding); - } - } - return $arr; - } - /** - * 简单的判断一下参数是否为一个URL链接 - * @param string $str - * @return boolean - */ - private function isURL($str) - { - if(preg_match('/^http(s)?:\/\/.+/', $str)) - { - return true; - } - return false; - } - +/** + * QueryList + * + * 一个基于phpQuery的通用列表采集类 + * + * @author Jaeger + * @email 734708094@qq.com + * @link http://git.oschina.net/jae/QueryList + * @version 1.6.1 + */ +require 'phpQuery/phpQuery.php'; +class QueryList +{ + private $pageURL; + private $regArr; + public $jsonArr; + private $regRange; + private $html; + private $outputEncoding; + private $htmlEncoding; + /** + * 构造函数 + * @param string $page 要抓取的网页URL地址(支持https);或者是html源代码 + * @param array $regArr 【选择器数组】说明:格式array("名称"=>array("选择器","类型"),.......),【类型】说明:值 "text" ,"html" ,"属性" + * @param string $regRange 【块选择器】:指 先按照规则 选出 几个大块 ,然后再分别再在块里面 进行相关的选择 + * @param string $getHtmlWay 【源码获取方式】指是通过curl抓取源码,还是通过file_get_contents抓取源码 + * @param string $outputEncoding【输出编码格式】指要以什么编码输出(UTF-8,GB2312,.....),防止出现乱码,如果设置为 假值 则不改变原字符串编码 + */ + public function __construct($page, $regArr, $regRange = '', $getHtmlWay = 'curl', $outputEncoding = false) + { + $this->outputEncoding = $outputEncoding; + if ($this->_isURL($page)) { + $this->pageURL = $page; + if ($getHtmlWay == 'curl') { + //为了能获取https:// + $ch = curl_init(); + curl_setopt($ch, CURLOPT_URL, $this->pageURL); + curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); + curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false); + curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); + $this->html = curl_exec($ch); + curl_close($ch); + } else { + $this->html = file_get_contents($this->pageURL); + } + } else { + $this->html = $page; + } + //获取编码格式 + $this->htmlEncoding = $this->_getEncode($this->html); + if (!empty($regArr)) { + $this->regArr = $regArr; + $this->regRange = $regRange; + $this->_getList(); + } + } + public function setQuery($regArr, $regRange = '') + { + $this->jsonArr = array(); + $this->regArr = $regArr; + $this->regRange = $regRange; + $this->_getList(); + } + private function _getList() + { + $hobj = phpQuery::newDocumentHTML($this->html); + if (!empty($this->regRange)) { + $robj = pq($hobj)->find($this->regRange); + $i = 0; + foreach ($robj as $item) { + while (list($key, $reg_value) = each($this->regArr)) { + $iobj = pq($item)->find($reg_value[0]); + switch ($reg_value[1]) { + case 'text': + $this->jsonArr[$i][$key] = trim(pq($iobj)->text()); + break; + case 'html': + $this->jsonArr[$i][$key] = trim(pq($iobj)->html()); + break; + default: + $this->jsonArr[$i][$key] = pq($iobj)->attr($reg_value[1]); + break; + } + } + //重置数组指针 + reset($this->regArr); + $i++; + } + } else { + while (list($key, $reg_value) = each($this->regArr)) { + $lobj = pq($hobj)->find($reg_value[0]); + $i = 0; + foreach ($lobj as $item) { + switch ($reg_value[1]) { + case 'text': + $this->jsonArr[$i++][$key] = trim(pq($item)->text()); + break; + case 'html': + $this->jsonArr[$i++][$key] = trim(pq($item)->html()); + break; + default: + $this->jsonArr[$i++][$key] = pq($item)->attr($reg_value[1]); + break; + } + } + } + } + if ($this->outputEncoding) { + //编码转换 + $this->jsonArr = $this->_arrayConvertEncoding($this->jsonArr, $this->outputEncoding, $this->htmlEncoding); + } + } + public function getJSON() + { + return json_encode($this->jsonArr); + } + /** + * 获取文件编码 + * @param $string + * @return string + */ + private function _getEncode($string) + { + return mb_detect_encoding($string, array('ASCII', 'GB2312', 'GBK', 'UTF-8')); + } + /** + * 递归转换数组值得编码格式 + * @param array $arr + * @param string $toEncoding + * @param string $fromEncoding + * @return array + */ + private function _arrayConvertEncoding($arr, $toEncoding, $fromEncoding) + { + if (!is_array($arr)) { + return $arr; + } + foreach ($arr as $key => $value) { + if (is_array($value)) { + $arr[$key] = $this->_arrayConvertEncoding($value, $toEncoding, $fromEncoding); + } else { + $arr[$key] = mb_convert_encoding($value, $toEncoding, $fromEncoding); + } + } + return $arr; + } + /** + * 简单的判断一下参数是否为一个URL链接 + * @param string $str + * @return boolean + */ + private function _isURL($str) + { + if (preg_match('/^http(s)?:\\/\\/.+/', $str)) { + return true; + } + return false; + } } \ No newline at end of file