This commit is contained in:
JAE
2014-06-12 14:58:02 +08:00
parent 3b7078a1ac
commit 0261fe2f22
5 changed files with 102 additions and 28 deletions

View File

@@ -7,42 +7,93 @@
* @author Jaeger * @author Jaeger
* @email 734708094@qq.com * @email 734708094@qq.com
* @link http://git.oschina.net/jae/QueryList * @link http://git.oschina.net/jae/QueryList
* @version 1.6.1 * @version 2.0.0
*
* @example
*
//获取CSDN移动开发栏目下的文章列表标题
$hj = QueryList::Query('http://mobile.csdn.net/',array("title"=>array('.unit h1','text')));
print_r($hj->jsonArr);
//获取CSDN文章页下面的文章标题和内容
$url = 'http://www.csdn.net/article/2014-06-05/2820091-build-or-buy-a-mobile-game-backend';
$reg = array(
'title'=>array('h1','text'), //获取纯文本格式的标题
'summary'=>array('.summary','text','input strong'), //获取纯文本的文章摘要但保留input和strong标签
'content'=>array('.news_content','html','div a') //获取html格式的文章内容但过滤掉div和a标签
);
$rang = '.left';
$hj = QueryList::Query($url,$reg,$rang,'curl');
print_r($hj->jsonArr);
//继续获取右边相关热门文章列表的标题以及链接地址
$hj->setQuery(array('title'=>array('','text'),'url'=>array('a','href')),'#con_two_2 li');
//输出json数据
echo $hj->getJson();
*/ */
require 'phpQuery/phpQuery.php'; require 'phpQuery/phpQuery.php';
class QueryList class QueryList
{ {
private $pageURL;
private $regArr; private $regArr;
public $jsonArr; public $jsonArr;
private $regRange; private $regRange;
private $html; private $html;
private $outputEncoding; private $outputEncoding;
private $htmlEncoding; private $htmlEncoding;
private static $ql;
/** /**
* 构造函数 * 静态方法,访问入口
* @param string $page 要抓取的网页URL地址(支持https);或者是html源代码 * @param string $page 要抓取的网页URL地址(支持https);或者是html源代码
* @param array $regArr 【选择器数组】说明格式array("名称"=>array("选择器","类型"),.......),【类型】说明:值 "text" ,"html" ,"属性" * @param array $regArr 【选择器数组】说明格式array("名称"=>array("选择器","类型"[,"标签列表"]),.......),【类型】说明:值 "text" ,"html" ,"属性" ,【标签列表】:可选当【类型】值为text时表示需要保留的HTML标签为html时表示要过滤掉的HTML标签
* @param string $regRange 【块选择器】:指 先按照规则 选出 几个大块 ,然后再分别再在块里面 进行相关的选择 * @param string $regRange 【块选择器】:指 先按照规则 选出 几个大块 ,然后再分别再在块里面 进行相关的选择
* @param string $getHtmlWay 【源码获取方式】指是通过curl抓取源码还是通过file_get_contents抓取源码 * @param string $getHtmlWay 【源码获取方式】指是通过curl抓取源码还是通过file_get_contents抓取源码
* @param string $outputEncoding【输出编码格式】指要以什么编码输出(UTF-8,GB2312,.....),防止出现乱码,如果设置为 假值 则不改变原字符串编码 * @param string $outputEncoding【输出编码格式】指要以什么编码输出(UTF-8,GB2312,.....),防止出现乱码,如果设置为 假值 则不改变原字符串编码
*/ */
public function __construct($page, $regArr, $regRange = '', $getHtmlWay = 'curl', $outputEncoding = false) public static function Query($page, $regArr, $regRange = '', $getHtmlWay = 'curl', $outputEncoding = false)
{ {
if(!(self::$ql instanceof self))
{
self::$ql = new self();
}
self::$ql->_query($page, $regArr, $regRange, $getHtmlWay, $outputEncoding);
return self::$ql;
}
/**
 * Re-run the extraction with a new set of selectors.
 *
 * The HTML held by this instance is not touched here; only the selector
 * rules are replaced, the previous results are discarded, and _getList()
 * is invoked again to rebuild $this->jsonArr.
 *
 * @param array  $regArr   Selector array.
 * @param string $regRange Block (range) selector; defaults to an empty string.
 */
public function setQuery($regArr, $regRange = '')
{
    // Install the new rules, then drop the old rows before collecting again.
    $this->regRange = $regRange;
    $this->regArr   = $regArr;
    $this->jsonArr  = array();

    $this->_getList();
}
/**
 * Return the collected results in JSON form.
 *
 * @return string The value produced by json_encode() for $this->jsonArr.
 */
public function getJSON()
{
    $encoded = json_encode($this->jsonArr);

    return $encoded;
}
private function _query($page, $regArr, $regRange, $getHtmlWay, $outputEncoding)
{
$this->jsonArr = array();
$this->outputEncoding = $outputEncoding; $this->outputEncoding = $outputEncoding;
if ($this->_isURL($page)) { if ($this->_isURL($page)) {
$this->pageURL = $page;
if ($getHtmlWay == 'curl') { if ($getHtmlWay == 'curl') {
//为了能获取https:// //为了能获取https://
$ch = curl_init(); $ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $this->pageURL); curl_setopt($ch, CURLOPT_URL, $page);
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false); curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
$this->html = curl_exec($ch); $this->html = curl_exec($ch);
curl_close($ch); curl_close($ch);
} else { } else {
$this->html = file_get_contents($this->pageURL); $this->html = file_get_contents($page);
} }
} else { } else {
$this->html = $page; $this->html = $page;
@@ -55,13 +106,6 @@ class QueryList
$this->_getList(); $this->_getList();
} }
} }
public function setQuery($regArr, $regRange = '')
{
$this->jsonArr = array();
$this->regArr = $regArr;
$this->regRange = $regRange;
$this->_getList();
}
private function _getList() private function _getList()
{ {
$hobj = phpQuery::newDocumentHTML($this->html); $hobj = phpQuery::newDocumentHTML($this->html);
@@ -70,13 +114,14 @@ class QueryList
$i = 0; $i = 0;
foreach ($robj as $item) { foreach ($robj as $item) {
while (list($key, $reg_value) = each($this->regArr)) { while (list($key, $reg_value) = each($this->regArr)) {
$tags = isset($reg_value[2])?$reg_value[2]:'';
$iobj = pq($item)->find($reg_value[0]); $iobj = pq($item)->find($reg_value[0]);
switch ($reg_value[1]) { switch ($reg_value[1]) {
case 'text': case 'text':
$this->jsonArr[$i][$key] = trim(pq($iobj)->text()); $this->jsonArr[$i][$key] = $this->_allowTags(pq($iobj)->html(),$tags);
break; break;
case 'html': case 'html':
$this->jsonArr[$i][$key] = trim(pq($iobj)->html()); $this->jsonArr[$i][$key] = $this->_stripTags(pq($iobj)->html(),$tags);
break; break;
default: default:
$this->jsonArr[$i][$key] = pq($iobj)->attr($reg_value[1]); $this->jsonArr[$i][$key] = pq($iobj)->attr($reg_value[1]);
@@ -89,15 +134,16 @@ class QueryList
} }
} else { } else {
while (list($key, $reg_value) = each($this->regArr)) { while (list($key, $reg_value) = each($this->regArr)) {
$tags = isset($reg_value[2])?$reg_value[2]:'';
$lobj = pq($hobj)->find($reg_value[0]); $lobj = pq($hobj)->find($reg_value[0]);
$i = 0; $i = 0;
foreach ($lobj as $item) { foreach ($lobj as $item) {
switch ($reg_value[1]) { switch ($reg_value[1]) {
case 'text': case 'text':
$this->jsonArr[$i++][$key] = trim(pq($item)->text()); $this->jsonArr[$i++][$key] = $this->_allowTags(pq($item)->html(),$tags);
break; break;
case 'html': case 'html':
$this->jsonArr[$i++][$key] = trim(pq($item)->html()); $this->jsonArr[$i++][$key] = $this->_stripTags(pq($item)->html(),$tags);
break; break;
default: default:
$this->jsonArr[$i++][$key] = pq($item)->attr($reg_value[1]); $this->jsonArr[$i++][$key] = pq($item)->attr($reg_value[1]);
@@ -111,10 +157,6 @@ class QueryList
$this->jsonArr = $this->_arrayConvertEncoding($this->jsonArr, $this->outputEncoding, $this->htmlEncoding); $this->jsonArr = $this->_arrayConvertEncoding($this->jsonArr, $this->outputEncoding, $this->htmlEncoding);
} }
} }
public function getJSON()
{
return json_encode($this->jsonArr);
}
/** /**
* 获取文件编码 * 获取文件编码
* @param $string * @param $string
@@ -157,4 +199,36 @@ class QueryList
} }
return false; return false;
} }
/**
 * Remove specific HTML tags from a fragment while keeping their inner content.
 *
 * Only the opening and closing tag markup is removed; whatever sits between
 * the tags is left in place. Matching is case-insensitive and is done on the
 * whole tag name, so asking to strip "a" does not affect <abbr> or <article>.
 *
 * @param string $html HTML fragment to clean; it is trimmed before processing.
 * @param string $tags Tag names to remove, separated by whitespace (e.g. "div a").
 * @return string The trimmed fragment with the listed tags removed.
 */
private function _stripTags($html,$tags)
{
    $html = trim($html);

    $names = preg_split('/\s+/', $tags, -1, PREG_SPLIT_NO_EMPTY);
    if (empty($names)) {
        // Nothing to strip: behave like the plain trimmed fragment.
        return $html;
    }

    // Escape each name so characters with a regex meaning cannot alter the pattern.
    $quoted = array();
    foreach ($names as $name) {
        $quoted[] = preg_quote($name, '/');
    }

    // The lookahead requires the name to be followed by whitespace, "/" or ">",
    // which stops a short name from matching the prefix of a longer tag name.
    $pattern = '/<\/?(?:' . implode('|', $quoted) . ')(?=[\s\/>])[^>]*>/i';
    $result  = preg_replace($pattern, '', $html);

    // preg_replace() yields null on a PCRE error; fall back to the untouched input.
    return $result === null ? $html : $result;
}
/**
 * Strip all HTML tags from a fragment except the ones explicitly listed.
 *
 * The fragment is trimmed first and then passed to strip_tags() together
 * with an allow-list built from the given tag names.
 *
 * @param string $html HTML fragment to clean.
 * @param string $tags Tag names to keep, separated by whitespace (e.g. "input strong").
 * @return string The fragment with every tag not in the list removed.
 */
private function _allowTags($html,$tags)
{
    $names = preg_split("/\s+/", $tags, -1, PREG_SPLIT_NO_EMPTY);

    // Build the "<a> <b> " style allow-list expected by strip_tags().
    $whitelist = '';
    if (!empty($names)) {
        $whitelist = '<' . implode('> <', $names) . '> ';
    }

    $fragment = trim($html);

    return strip_tags($fragment, $whitelist);
}
}

View File

@@ -1 +1 @@

View File

@@ -48,7 +48,7 @@ require '../QueryList.class.php';
$reg_znum='/([\d,]+) result(s)?/'; $reg_znum='/([\d,]+) result(s)?/';
$getHtmlWay = 'curl'; $getHtmlWay = 'curl';
} }
$searcherObj = new QueryList($url,$this->regArr,$this->regRange,$getHtmlWay,false); $searcherObj = QueryList::Query($url,$this->regArr,$this->regRange,$getHtmlWay,false);
for($i=0;$i<count($searcherObj->jsonArr);$i++) for($i=0;$i<count($searcherObj->jsonArr);$i++)
{ {
if($this->searcher=='baidu') if($this->searcher=='baidu')

View File

@@ -6,7 +6,7 @@ $url = "http://www.oschina.net/code/list";
$reg = array("title"=>array(".code_title a:eq(0)","text"),"url"=>array(".code_title a:eq(0)","href"),"author"=>array("img","title")); $reg = array("title"=>array(".code_title a:eq(0)","text"),"url"=>array(".code_title a:eq(0)","href"),"author"=>array("img","title"));
$rang = ".code_list li"; $rang = ".code_list li";
//使用curl抓取源码并以GB2312编码格式输出 //使用curl抓取源码并以GB2312编码格式输出
$hj = new QueryList($url,$reg,$rang,'curl','GB2312'); $hj = QueryList::Query($url,$reg,$rang,'curl','GB2312');
$arr = $hj->jsonArr; $arr = $hj->jsonArr;
echo "<pre>"; echo "<pre>";
print_r($arr); print_r($arr);
@@ -21,7 +21,7 @@ echo $json . "<hr/>";
//采OSC内容页内容 //采OSC内容页内容
$url = "http://www.oschina.net/code/snippet_186288_23816"; $url = "http://www.oschina.net/code/snippet_186288_23816";
$reg = array("title"=>array(".QTitle h1","text"),"con"=>array(".Content","html")); $reg = array("title"=>array(".QTitle h1","text"),"con"=>array(".Content","html"));
$hj = new QueryList($url,$reg); $hj = QueryList::Query($url,$reg);
$arr = $hj->jsonArr; $arr = $hj->jsonArr;
echo "<pre>"; echo "<pre>";
print_r($arr); print_r($arr);

BIN
demo/thanks.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 8.6 KiB