209 lines
6.3 KiB
PHP
209 lines
6.3 KiB
PHP
<?php
|
||
/**
 * Created by SublimeText2.
 * User: JAE
 * Date: 2014-1-7
 * Blog: http://blog.jaekj.com
 * QQ: 734708094
 * Generic list scraping class
 * Version: V1.6
 */
|
||
require_once 'phpQuery/phpQuery.php';
|
||
/**
 * Generic list scraper: loads a page (by URL or from raw HTML), applies
 * CSS-style selectors through phpQuery and collects the matches into rows.
 *
 * Results are exposed through the public $jsonArr property and getJSON().
 */
class QueryList
{
    /** @var string|null URL the markup was fetched from; stays unset when raw HTML was supplied. */
    private $pageURL;

    /** @var array Selector rules: array("name" => array("selector", "type"), ...). */
    private $regArr = array();

    /** @var array Extracted rows, indexed by row number and then by rule name. */
    public $jsonArr = array();

    /** @var string Optional block selector; when non-empty, rules are applied inside each matched block. */
    private $regRange;

    /** @var string Markup being queried. */
    private $html;

    /** @var string|bool Encoding the results are converted to; a falsy value disables conversion. */
    private $output_encoding;

    /** @var string|bool Encoding detected for the markup (false when detection failed). */
    private $html_encoding;

    /**
     * Load the page and, when selector rules are given, run the extraction immediately.
     *
     * Declared as __construct(): a method merely named after the class is no
     * longer treated as a constructor since PHP 8.0, so the old spelling left
     * the object uninitialised there.
     *
     * @param string $page            URL of the page to scrape (https supported), or the HTML source itself
     * @param array  $regArr          selector rules, array("name" => array("selector", "type"), ...);
     *                                "type" is "text", "html", or the name of an attribute to read
     * @param string $regRange        block selector: first pick the matching blocks, then apply
     *                                the rules inside each block
     * @param string $getHtmlWay      "curl" fetches with cURL; any other value uses file_get_contents
     * @param string $output_encoding encoding of the extracted values; a falsy value keeps them unconverted
     */
    public function __construct($page, $regArr = array(), $regRange = '', $getHtmlWay = "curl", $output_encoding = "UTF-8")
    {
        $this->output_encoding = $output_encoding;

        if ($this->isURL($page)) {
            $this->pageURL = $page;
            $this->html = $this->fetchHtml($page, $getHtmlWay);
        } else {
            $this->html = $page;
        }

        // Detect the source encoding once so results can be converted later.
        $this->html_encoding = $this->get_encode($this->html);

        if (!empty($regArr)) {
            $this->regArr = $regArr;
            $this->regRange = $regRange;
            $this->getList();
        }
    }

    /**
     * Discard previous results and run a new set of selector rules against
     * the markup that is already loaded.
     *
     * @param array  $regArr   selector rules, same format as in the constructor
     * @param string $regRange optional block selector
     */
    public function setQuery($regArr, $regRange = '')
    {
        $this->jsonArr = array();
        $this->regArr = $regArr;
        $this->regRange = $regRange;
        $this->getList();
    }

    /**
     * Download the markup of a URL.
     *
     * @param string $url        address to fetch
     * @param string $getHtmlWay "curl" for cURL, anything else for file_get_contents
     * @return string response body, or an empty string when the request failed
     */
    private function fetchHtml($url, $getHtmlWay)
    {
        if ($getHtmlWay == "curl") {
            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, $url);
            // NOTE(review): certificate checks are switched off on purpose so that
            // any https page can be fetched; the transfer is therefore not
            // protected against man-in-the-middle tampering.
            curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
            curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
            $body = curl_exec($ch);
            curl_close($ch);
        } else {
            $body = file_get_contents($url);
        }

        // Both fetchers report failure as false; keep $html a string either way.
        return ($body === false) ? '' : $body;
    }

    /**
     * Apply the current rules to the markup and fill $jsonArr.
     *
     * With a block selector every matched block becomes one row and each rule
     * is evaluated inside that block. Without one, each rule is matched
     * against the whole document and its n-th match is stored in row n.
     */
    private function getList()
    {
        $doc = phpQuery::newDocumentHTML($this->html);

        if (!empty($this->regRange)) {
            $row = 0;
            foreach (pq($doc)->find($this->regRange) as $block) {
                foreach ($this->regArr as $key => $rule) {
                    $matched = pq($block)->find($rule[0]);
                    $this->jsonArr[$row][$key] = $this->extractValue($matched, $rule[1]);
                }
                $row++;
            }
        } else {
            foreach ($this->regArr as $key => $rule) {
                $row = 0;
                foreach (pq($doc)->find($rule[0]) as $node) {
                    $this->jsonArr[$row++][$key] = $this->extractValue($node, $rule[1]);
                }
            }
        }

        // Convert only when an output encoding is requested AND the source
        // encoding is known; a failed detection (false) is not a valid
        // source encoding for mb_convert_encoding.
        if ($this->output_encoding && $this->html_encoding) {
            $this->jsonArr = $this->array_convert_encoding($this->jsonArr, $this->output_encoding, $this->html_encoding);
        }
    }

    /**
     * Read one value from a matched node according to the rule type.
     *
     * @param mixed  $node node or selection accepted by pq()
     * @param string $type "text", "html", or an attribute name
     * @return mixed trimmed text/HTML, or whatever attr() returns for the attribute
     */
    private function extractValue($node, $type)
    {
        $selection = pq($node);

        if ($type === 'text') {
            return trim($selection->text());
        }
        if ($type === 'html') {
            return trim($selection->html());
        }

        return $selection->attr($type);
    }

    /**
     * Return the collected rows encoded as JSON.
     *
     * @return string|false result of json_encode() on $jsonArr
     */
    public function getJSON()
    {
        return json_encode($this->jsonArr);
    }

    /**
     * Detect the encoding of a string.
     *
     * @param string $string
     * @return string|bool one of the candidate encodings, or false when none matches
     */
    private function get_encode($string)
    {
        return mb_detect_encoding($string, array('ASCII', 'GB2312', 'GBK', 'UTF-8'));
    }

    /**
     * Recursively convert the encoding of every value in an array.
     *
     * @param array  $arr
     * @param string $to_encoding
     * @param string $from_encoding
     * @return array the array with converted values; non-array input is returned unchanged
     */
    private function array_convert_encoding($arr, $to_encoding, $from_encoding)
    {
        if (!is_array($arr)) {
            return $arr;
        }

        foreach ($arr as $key => $value) {
            if (is_array($value)) {
                $arr[$key] = $this->array_convert_encoding($value, $to_encoding, $from_encoding);
            } else {
                // Cast first: a missing attribute may arrive as null, and passing
                // null to mb_convert_encoding is deprecated; it still becomes ''.
                $arr[$key] = mb_convert_encoding((string) $value, $to_encoding, $from_encoding);
            }
        }

        return $arr;
    }

    /**
     * Rough check whether the argument looks like an http(s) URL.
     *
     * @param string $str
     * @return boolean
     */
    private function isURL($str)
    {
        return preg_match('/^http(s)?:\/\/.+/', (string) $str) === 1;
    }
}
|
||
// $hj = new QueryList("http://www.baidu.com/s?rn=20&ie=utf-8&bs=love+me&f=8&rsv_bp=1&wd=%E4%B8%83%E9%87%8C%E9%A6%99&rsv_sug3=2&rsv_sug=0&rsv_sug1=2&rsv_sug4=111&inputT=2224",array("title"=>array("h3.t a","text"),"url"=>array("h3.t a","href"),"con"=>array("div.c-abstract","html")),"table.result");
//$hj=new QueryList("https://www.google.com.hk/search?filter=0&lr=&newwindow=1&safe=images&hl=en&as_qdr=all&q=QQ2014",array("hcon"=>array(".st","html")));
// print_r($hj->getJSON());
// print_r($hj->jsonArr);
// $hj = new QueryList('https://www.google.com.hk/search?filter=0&lr=&newwindow=1&safe=images&hl=en&as_qdr=all&q=QQ2014',array("url"=>array("h3.r a","href")));
// print_r($hj->jsonArr);
// $hj->setQuery(array("hcon"=>array("span.st","text")));
// print_r($hj->jsonArr);
/*
$ct = new QueryList("http://www.mianbao.com/cartoon/",array("title"=>array("a:eq(0)","text")),"ul.txt-list li","get");
print_r($ct->jsonArr);*/

/*$hj = new QueryList("http://t.sohu.com/jingxuan",array("con"=>array(".ugc","html")),"[id$=_con]","get","UTF-8");
print_r($hj->jsonArr);*/

/*$html = file_get_contents('http://www.baidu.com/s?rn=20&ie=utf-8&bs=love+me&f=8&rsv_bp=1&wd=%E4%B8%83%E9%87%8C%E9%A6%99&rsv_sug3=2&rsv_sug=0&rsv_sug1=2&rsv_sug4=111&inputT=2224');
$hj = new QueryList($html,array("title"=>array("h3.t a","text"),"url"=>array("h3.t a","href"),"con"=>array("div.c-abstract","html")),"table.result");
print_r($hj->jsonArr);*/
|
||
?>
|