QueryList/QueryList.class.php
2014-04-03 11:04:18 +08:00

209 lines
6.3 KiB
PHP
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<?php
/**
* Created by SublimeText2.
* User:JAE
* Date: 2014-1-7
* Blog:http://blog.jaekj.com
* QQ:734708094
* 通用列表采集类
* 版本:V1.6
*/
require_once 'phpQuery/phpQuery.php';
/**
 * Generic list scraper built on top of phpQuery.
 *
 * Given a page (URL or raw HTML) and a map of CSS selectors, it collects the
 * matched values into the public $jsonArr property as a list of rows, each row
 * being an associative array keyed by the names used in the selector map.
 */
class QueryList
{
    /** URL the HTML was fetched from (only set when $page was a URL). */
    private $pageURL;
    /** Selector map: array("name" => array("selector", "type"), ...). */
    private $regArr = array();
    /** Result rows produced by the last query. */
    public $jsonArr = array();
    /** Optional block selector; when set, selectors are applied inside each block. */
    private $regRange;
    /** Raw HTML source being queried. */
    private $html;
    /** Target encoding for extracted values; a falsy value disables conversion. */
    private $output_encoding;
    /** Encoding detected for the HTML source, or false when detection failed. */
    private $html_encoding;

    /**
     * Constructor.
     *
     * Uses __construct instead of the PHP4-style same-name method, which is no
     * longer treated as a constructor from PHP 8.0 onwards.
     *
     * @param string $page            URL of the page to fetch (https supported), or the HTML source itself
     * @param array  $regArr          Selector map: array("name" => array("selector", "type"), ...);
     *                                "type" is "text", "html", or the name of an attribute to read
     * @param string $regRange        Block selector: blocks are selected first, then the selectors
     *                                in $regArr are applied inside each block
     * @param string $getHtmlWay      How to fetch the source: "curl" uses cURL, any other value
     *                                uses file_get_contents
     * @param string $output_encoding Encoding of the extracted values (to avoid garbled output);
     *                                a falsy value leaves the strings untouched
     */
    public function __construct($page, $regArr = array(), $regRange = '', $getHtmlWay = "curl", $output_encoding = "UTF-8")
    {
        $this->output_encoding = $output_encoding;

        if ($this->isURL($page)) {
            $this->pageURL = $page;
            $this->html = $this->fetchHtml($this->pageURL, $getHtmlWay);
        } else {
            $this->html = $page;
        }

        // Detect the encoding of the source so extracted values can be converted later.
        $this->html_encoding = $this->get_encode($this->html);

        if (!empty($regArr)) {
            $this->regArr = $regArr;
            $this->regRange = $regRange;
            $this->getList();
        }
    }

    /**
     * Run a new query against the already loaded HTML, replacing previous results.
     *
     * @param array  $regArr   Selector map, same format as in the constructor
     * @param string $regRange Optional block selector, same meaning as in the constructor
     * @return void
     */
    public function setQuery($regArr, $regRange = '')
    {
        $this->jsonArr = array();
        $this->regArr = $regArr;
        $this->regRange = $regRange;
        $this->getList();
    }

    /**
     * Download the HTML source of a URL.
     *
     * @param string $url        Address to fetch
     * @param string $getHtmlWay "curl" to use cURL, anything else to use file_get_contents
     * @return string The response body, or an empty string when the fetch failed
     */
    private function fetchHtml($url, $getHtmlWay)
    {
        if ($getHtmlWay == "curl") {
            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, $url);
            // NOTE(review): certificate verification is deliberately disabled so that any
            // https page can be fetched; this makes the request vulnerable to MITM attacks.
            curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
            curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
            $html = curl_exec($ch);
            curl_close($ch);
        } else {
            $html = file_get_contents($url);
        }

        // Both fetchers return false on failure; normalise to an empty document so the
        // rest of the pipeline always works on a string.
        return $html === false ? '' : $html;
    }

    /**
     * Apply the current selectors to the HTML and fill $jsonArr.
     *
     * With a block selector, each matched block yields one row. Without one,
     * each selector is matched against the whole document and its n-th match
     * is stored in row n.
     *
     * @return void
     */
    private function getList()
    {
        $hobj = phpQuery::newDocumentHTML($this->html);

        if (!empty($this->regRange)) {
            $i = 0;
            foreach (pq($hobj)->find($this->regRange) as $item) {
                // foreach replaces each(), which was removed in PHP 8.0 and depended on
                // the array's internal pointer being reset between blocks.
                foreach ($this->regArr as $key => $reg_value) {
                    $iobj = pq($item)->find($reg_value[0]);
                    $this->jsonArr[$i][$key] = $this->extractValue($iobj, $reg_value[1]);
                }
                $i++;
            }
        } else {
            foreach ($this->regArr as $key => $reg_value) {
                $i = 0;
                foreach (pq($hobj)->find($reg_value[0]) as $item) {
                    $this->jsonArr[$i++][$key] = $this->extractValue($item, $reg_value[1]);
                }
            }
        }

        // Convert only when a target encoding is requested AND the source encoding was
        // detected; mb_detect_encoding() returns false on failure, which is not a valid
        // source encoding for mb_convert_encoding().
        if ($this->output_encoding && $this->html_encoding) {
            $this->jsonArr = $this->array_convert_encoding($this->jsonArr, $this->output_encoding, $this->html_encoding);
        }
    }

    /**
     * Read one value from a matched node.
     *
     * @param mixed  $node Node (or phpQuery selection) to read from
     * @param string $type "text" for trimmed text content, "html" for trimmed inner HTML,
     *                     anything else is treated as the name of an attribute
     * @return mixed The extracted string, or whatever attr() returns for the attribute
     */
    private function extractValue($node, $type)
    {
        switch ($type) {
            case 'text':
                // Cast guards against a non-string result; trim(null) is deprecated in PHP 8.1.
                return trim((string) pq($node)->text());
            case 'html':
                return trim((string) pq($node)->html());
            default:
                return pq($node)->attr($type);
        }
    }

    /**
     * Return the collected rows encoded as JSON.
     *
     * @return string|false Result of json_encode() on $jsonArr
     */
    public function getJSON()
    {
        return json_encode($this->jsonArr);
    }

    /**
     * Detect the encoding of a string.
     *
     * @param string $string
     * @return string|false Detected encoding name, or false when none of the candidates match
     */
    private function get_encode($string)
    {
        return mb_detect_encoding($string, array('ASCII', 'GB2312', 'GBK', 'UTF-8'));
    }

    /**
     * Recursively convert the encoding of every value in an array.
     *
     * @param array  $arr
     * @param string $to_encoding
     * @param string $from_encoding
     * @return array
     */
    private function array_convert_encoding($arr, $to_encoding, $from_encoding)
    {
        if (!is_array($arr)) {
            return $arr;
        }
        foreach ($arr as $key => $value) {
            if (is_array($value)) {
                $arr[$key] = $this->array_convert_encoding($value, $to_encoding, $from_encoding);
            } else {
                // Missing attributes arrive as null; the cast keeps the historical ""
                // result without passing null to a string parameter.
                $arr[$key] = mb_convert_encoding((string) $value, $to_encoding, $from_encoding);
            }
        }
        return $arr;
    }

    /**
     * Simple check of whether the argument looks like an http(s) URL.
     *
     * @param string $str
     * @return boolean
     */
    private function isURL($str)
    {
        if (preg_match('/^http(s)?:\/\/.+/', $str)) {
            return true;
        }
        return false;
    }
}
// $hj = new QueryList("http://www.baidu.com/s?rn=20&ie=utf-8&bs=love+me&f=8&rsv_bp=1&wd=%E4%B8%83%E9%87%8C%E9%A6%99&rsv_sug3=2&rsv_sug=0&rsv_sug1=2&rsv_sug4=111&inputT=2224",array("title"=>array("h3.t a","text"),"url"=>array("h3.t a","href"),"con"=>array("div.c-abstract","html")),"table.result");
//$hj=new QueryList("https://www.google.com.hk/search?filter=0&lr=&newwindow=1&safe=images&hl=en&as_qdr=all&q=QQ2014",array("hcon"=>array(".st","html")));
// print_r($hj->getJSON());
// print_r($hj->jsonArr);
// $hj = new QueryList('https://www.google.com.hk/search?filter=0&lr=&newwindow=1&safe=images&hl=en&as_qdr=all&q=QQ2014',array("url"=>array("h3.r a","href")));
// print_r($hj->jsonArr);
// $hj->setQuery(array("hcon"=>array("span.st","text")));
// print_r($hj->jsonArr);
/*
$ct = new QueryList("http://www.mianbao.com/cartoon/",array("title"=>array("a:eq(0)","text")),"ul.txt-list li","get");
print_r($ct->jsonArr);*/
/*$hj = new QueryList("http://t.sohu.com/jingxuan",array("con"=>array(".ugc","html")),"[id$=_con]","get","UTF-8");
print_r($hj->jsonArr);*/
/*$html = file_get_contents('http://www.baidu.com/s?rn=20&ie=utf-8&bs=love+me&f=8&rsv_bp=1&wd=%E4%B8%83%E9%87%8C%E9%A6%99&rsv_sug3=2&rsv_sug=0&rsv_sug1=2&rsv_sug4=111&inputT=2224');
$hj = new QueryList($html,array("title"=>array("h3.t a","text"),"url"=>array("h3.t a","href"),"con"=>array("div.c-abstract","html")),"table.result");
print_r($hj->jsonArr);*/
?>