规范命名

This commit is contained in:
JAE 2014-04-30 11:31:04 +08:00
parent 8bc6607acd
commit 3b7078a1ac

View File

@ -1,191 +1,160 @@
<?php
/**
 * QueryList
 *
 * A general-purpose list-scraping class built on top of phpQuery.
 *
 * @author Jaeger
 * @email 734708094@qq.com
 * @link http://git.oschina.net/jae/QueryList
 * @version 1.6.1
 */
// phpQuery supplies the phpQuery class and the pq() helper used by QueryList.
require 'phpQuery/phpQuery.php';
/**
 * Scrapes a list of records out of an HTML page using phpQuery selectors.
 *
 * The page is either fetched from a URL or supplied directly as HTML source.
 * Each entry of the selector map produces one named field per result row; the
 * rows are collected in $jsonArr and can be serialised with getJSON().
 */
class QueryList
{
    /** @var string|null URL the HTML was fetched from; stays null when raw HTML was supplied */
    private $pageURL;
    /** @var array Selector map: array("name" => array("selector", "type"), ...) */
    private $regArr = array();
    /** @var array Extracted rows, indexed by row number, then by field name */
    public $jsonArr = array();
    /** @var string Optional block selector; when set, fields are looked up inside each block */
    private $regRange;
    /** @var string HTML source being queried */
    private $html;
    /** @var string|false Target encoding for the results; a falsy value disables conversion */
    private $outputEncoding;
    /** @var string|false Encoding detected for the HTML source (false when detection failed) */
    private $htmlEncoding;

    /**
     * Constructor.
     *
     * @param string $page           URL of the page to scrape (https is supported), or the HTML source itself
     * @param array  $regArr         Selector map in the form array("name" => array("selector", "type"), ...),
     *                               where "type" is "text", "html", or the name of an attribute to read
     * @param string $regRange       Block selector: first select the matching blocks, then apply every
     *                               selector of $regArr inside each block
     * @param string $getHtmlWay     How to fetch the source: "curl" uses cURL, any other value uses file_get_contents
     * @param string $outputEncoding Encoding of the results (UTF-8, GB2312, ...); a falsy value leaves
     *                               the extracted strings in their original encoding
     */
    public function __construct($page, $regArr, $regRange = '', $getHtmlWay = 'curl', $outputEncoding = false)
    {
        $this->outputEncoding = $outputEncoding;
        if ($this->_isURL($page)) {
            $this->pageURL = $page;
            $this->html = $this->_fetchHtml($page, $getHtmlWay);
        } else {
            $this->html = $page;
        }
        // Remember the source encoding so results can be converted later.
        $this->htmlEncoding = $this->_getEncode($this->html);
        if (!empty($regArr)) {
            $this->regArr = $regArr;
            $this->regRange = $regRange;
            $this->_getList();
        }
    }

    /**
     * Runs a new query against the already loaded HTML, discarding earlier results.
     *
     * @param array  $regArr   Selector map, same format as in the constructor
     * @param string $regRange Optional block selector
     * @return void
     */
    public function setQuery($regArr, $regRange = '')
    {
        $this->jsonArr = array();
        $this->regArr = $regArr;
        $this->regRange = $regRange;
        $this->_getList();
    }

    /**
     * Returns the extracted rows encoded as JSON.
     *
     * Note: json_encode() only accepts UTF-8 strings, so a non-UTF-8
     * $outputEncoding can make it return false.
     *
     * @return string|false
     */
    public function getJSON()
    {
        return json_encode($this->jsonArr);
    }

    /**
     * Downloads the HTML source of a URL.
     *
     * @param string $url        Page URL
     * @param string $getHtmlWay "curl" to use cURL, anything else to use file_get_contents
     * @return string The response body, or an empty string when the download failed
     */
    private function _fetchHtml($url, $getHtmlWay)
    {
        if ($getHtmlWay == 'curl') {
            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, $url);
            // NOTE(security): certificate and host verification are switched off so that
            // any https:// page can be fetched; this leaves the request open to MITM attacks.
            curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
            curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
            $body = curl_exec($ch);
            curl_close($ch);
        } else {
            $body = file_get_contents($url);
        }
        // Both fetchers signal failure with false; keep $this->html a string in that case.
        return $body === false ? '' : $body;
    }

    /**
     * Applies the selector map to the HTML and fills $jsonArr.
     *
     * With a block selector, every matched block becomes one row and each field is
     * searched inside that block. Without one, each selector is matched against the
     * whole document and its n-th match becomes the field of the n-th row.
     *
     * @return void
     */
    private function _getList()
    {
        $hobj = phpQuery::newDocumentHTML($this->html);
        if (!empty($this->regRange)) {
            $i = 0;
            foreach (pq($hobj)->find($this->regRange) as $item) {
                foreach ($this->regArr as $key => $reg_value) {
                    $iobj = pq($item)->find($reg_value[0]);
                    $this->jsonArr[$i][$key] = $this->_extractValue($iobj, $reg_value[1]);
                }
                $i++;
            }
        } else {
            foreach ($this->regArr as $key => $reg_value) {
                $i = 0;
                foreach (pq($hobj)->find($reg_value[0]) as $item) {
                    $this->jsonArr[$i++][$key] = $this->_extractValue($item, $reg_value[1]);
                }
            }
        }
        if ($this->outputEncoding) {
            // Convert every extracted string to the requested output encoding.
            $this->jsonArr = $this->_arrayConvertEncoding($this->jsonArr, $this->outputEncoding, $this->htmlEncoding);
        }
    }

    /**
     * Reads one field value from a matched node.
     *
     * @param mixed  $node Node (or phpQuery selection) to read from
     * @param string $type "text" for the trimmed text content, "html" for the trimmed
     *                     inner HTML, anything else is treated as an attribute name
     * @return mixed
     */
    private function _extractValue($node, $type)
    {
        switch ($type) {
            case 'text':
                return trim(pq($node)->text());
            case 'html':
                return trim(pq($node)->html());
            default:
                return pq($node)->attr($type);
        }
    }

    /**
     * Detects the encoding of a string.
     *
     * @param string $string
     * @return string|false One of ASCII, GB2312, GBK, UTF-8 as reported by mbstring, or false when none matches
     */
    private function _getEncode($string)
    {
        return mb_detect_encoding($string, array('ASCII', 'GB2312', 'GBK', 'UTF-8'));
    }

    /**
     * Recursively converts the encoding of every value in an array.
     *
     * @param array        $arr
     * @param string       $toEncoding
     * @param string|false $fromEncoding Source encoding; when falsy (detection failed) the
     *                                   source encoding argument is omitted and mbstring's default applies
     * @return array
     */
    private function _arrayConvertEncoding($arr, $toEncoding, $fromEncoding)
    {
        if (!is_array($arr)) {
            return $arr;
        }
        foreach ($arr as $key => $value) {
            if (is_array($value)) {
                $arr[$key] = $this->_arrayConvertEncoding($value, $toEncoding, $fromEncoding);
            } elseif ($fromEncoding) {
                // The cast keeps missing attributes (null) from being passed to mbstring.
                $arr[$key] = mb_convert_encoding((string) $value, $toEncoding, $fromEncoding);
            } else {
                $arr[$key] = mb_convert_encoding((string) $value, $toEncoding);
            }
        }
        return $arr;
    }

    /**
     * Rough check whether the argument looks like an http(s) URL.
     *
     * @param string $str
     * @return boolean
     */
    private function _isURL($str)
    {
        return preg_match('/^http(s)?:\/\/.+/', $str) === 1;
    }
}