Merge branch 'dev'
This commit is contained in:
commit
ba6e6fb4c8
@ -12,7 +12,7 @@ use phpQuery,Exception,ReflectionClass;
|
||||
* @author Jaeger
|
||||
* @email 734708094@qq.com
|
||||
* @link http://git.oschina.net/jae/QueryList
|
||||
* @version 3.1.0
|
||||
* @version 3.1.1
|
||||
*
|
||||
* @example
|
||||
*
|
||||
@ -34,14 +34,14 @@ class HJ{
|
||||
}
|
||||
//获取CSDN文章页下面的文章标题和内容
|
||||
$url = 'http://www.csdn.net/article/2014-06-05/2820091-build-or-buy-a-mobile-game-backend';
|
||||
$reg = array(
|
||||
$rules = array(
|
||||
'title'=>array('h1','text','','callfun1'), //获取纯文本格式的标题,并调用回调函数1
|
||||
'summary'=>array('.summary','text','-input strong'), //获取纯文本的文章摘要,但保留strong标签并去除input标签
|
||||
'content'=>array('.news_content','html','div a -.copyright'), //获取html格式的文章内容,但过滤掉div和a标签,去除类名为copyright的元素
|
||||
'callback'=>array('HJ','callfun2') //调用回调函数2作为全局回调函数
|
||||
);
|
||||
$rang = '.left';
|
||||
$hj = QueryList::Query($url,$reg,$rang);
|
||||
$hj = QueryList::Query($url,$rules,$rang);
|
||||
print_r($hj->data);
|
||||
|
||||
//继续获取右边相关热门文章列表的标题以及链接地址
|
||||
@ -53,9 +53,7 @@ echo $hj->getData();
|
||||
|
||||
class QueryList
|
||||
{
|
||||
private $regArr;
|
||||
public $data;
|
||||
private $regRange;
|
||||
public $html;
|
||||
private $pqHtml;
|
||||
private $outputEncoding = false;
|
||||
@ -68,21 +66,21 @@ class QueryList
|
||||
/**
|
||||
* 静态方法,访问入口
|
||||
* @param string $page 要抓取的网页URL地址(支持https);或者是html源代码
|
||||
* @param array $regArr 【选择器数组】说明:格式array("名称"=>array("选择器","类型"[,"标签过滤列表"][,"回调函数"]),.......[,"callback"=>"全局回调函数"]);
|
||||
* @param array $rules 【选择器数组】说明:格式array("名称"=>array("选择器","类型"[,"标签过滤列表"][,"回调函数"]),.......[,"callback"=>"全局回调函数"]);
|
||||
* 【选择器】说明:可以为任意的jQuery选择器语法
|
||||
* 【类型】说明:值 "text" ,"html" ,"HTML标签属性" ,
|
||||
* 【标签过滤列表】:可选,当标签名前面添加减号(-)时(此时标签可以为任意的元素选择器),表示移除该标签以及标签内容,否则当【类型】值为text时表示需要保留的HTML标签,为html时表示要过滤掉的HTML标签
|
||||
* 【标签过滤列表】:可选,要过滤的选择器名,多个用空格隔开,当标签名前面添加减号(-)时(此时标签可以为任意的元素选择器),表示移除该标签以及标签内容,否则当【类型】值为text时表示需要保留的HTML标签,为html时表示要过滤掉的HTML标签
|
||||
* 【回调函数】/【全局回调函数】:可选,字符串(函数名) 或 数组(array("类名","类的静态方法")),回调函数应有两个参数,第一个参数是选择到的内容,第二个参数是选择器数组下标,回调函数会覆盖全局回调函数
|
||||
*
|
||||
* @param string $regRange 【块选择器】:指 先按照规则 选出 几个大块 ,然后再分别再在块里面 进行相关的选择
|
||||
* @param string $range 【块选择器】:指 先按照规则 选出 几个大块 ,然后再分别再在块里面 进行相关的选择
|
||||
* @param string $outputEncoding【输出编码格式】指要以什么编码输出(UTF-8,GB2312,.....),防止出现乱码,如果设置为 假值 则不改变原字符串编码
|
||||
* @param string $inputEncoding 【输入编码格式】明确指定输入的页面编码格式(UTF-8,GB2312,.....),防止出现乱码,如果设置为 假值 则自动识别
|
||||
* @param bool|false $removeHead 【是否移除页面头部区域】 乱码终极解决方案
|
||||
* @return mixed
|
||||
*/
|
||||
public static function Query($page,array $regArr, $regRange = '', $outputEncoding = null, $inputEncoding = null,$removeHead = false)
|
||||
public static function Query($page,array $rules, $range = '', $outputEncoding = null, $inputEncoding = null,$removeHead = false)
|
||||
{
|
||||
return self::getInstance()->_query($page, $regArr, $regRange, $outputEncoding, $inputEncoding,$removeHead);
|
||||
return self::getInstance()->_query($page, $rules, $range, $outputEncoding, $inputEncoding,$removeHead);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -144,19 +142,19 @@ class QueryList
|
||||
|
||||
/**
|
||||
* 重新设置选择器
|
||||
* @param $regArr
|
||||
* @param string $regRange
|
||||
* @param $rules
|
||||
* @param string $range
|
||||
* @param string $outputEncoding
|
||||
* @param string $inputEncoding
|
||||
* @param bool|false $removeHead
|
||||
* @return QueryList
|
||||
*/
|
||||
public function setQuery(array $regArr, $regRange = '',$outputEncoding = null, $inputEncoding = null,$removeHead = false)
|
||||
public function setQuery(array $rules, $range = '',$outputEncoding = null, $inputEncoding = null,$removeHead = false)
|
||||
{
|
||||
return $this->_query($this->html,$regArr, $regRange, $outputEncoding, $inputEncoding,$removeHead);
|
||||
return $this->_query($this->html,$rules, $range, $outputEncoding, $inputEncoding,$removeHead);
|
||||
}
|
||||
|
||||
private function _query($page,array $regArr, $regRange, $outputEncoding, $inputEncoding,$removeHead)
|
||||
private function _query($page,array $rules, $range, $outputEncoding, $inputEncoding,$removeHead)
|
||||
{
|
||||
$this->data = array();
|
||||
$this->html = $this->_isURL($page)?$this->_request($page):$page;
|
||||
@ -170,8 +168,8 @@ class QueryList
|
||||
//获取编码格式
|
||||
$this->htmlEncoding = $this->inputEncoding?$this->inputEncoding:$this->_getEncode($this->html);
|
||||
// $this->html = $this->_removeTags($this->html,array('script','style'));
|
||||
$this->regArr = $regArr;
|
||||
$this->regRange = $regRange;
|
||||
$this->regArr = $rules;
|
||||
$this->regRange = $range;
|
||||
$this->_getList();
|
||||
return $this;
|
||||
}
|
||||
|
@ -5,7 +5,8 @@
|
||||
"homepage": "http://querylist.cc",
|
||||
"require": {
|
||||
"PHP":">=5.3.0",
|
||||
"jaeger/phpquery-single": "^0.9.5"
|
||||
"jaeger/phpquery-single": "^0.9.5",
|
||||
"jaeger/querylist-ext-request": "^1.0"
|
||||
},
|
||||
"license": "MIT",
|
||||
"authors": [
|
||||
|
Loading…
x
Reference in New Issue
Block a user