QueryList/QueryList.class.php

<?php
	/**
	 * Created by SublimeText2.
	 * User:JAE
	 * Date: 2014-1-7
	 * Blog:http://blog.jaekj.com
	 * QQ:734708094
	 * 通用列表采集类
	 * 版本:V1.6
	 */
	require_once 'phpQuery/phpQuery.php';
	/**
	 * Generic list scraper built on top of phpQuery.
	 *
	 * Takes a URL (http/https) or a raw HTML string, applies a set of named
	 * CSS-style selector rules to it and collects the matches, row by row,
	 * into the public $jsonArr property.
	 */
	class QueryList{

		/** @var string|null URL the page was fetched from; only set when $page was a URL */
		private $pageURL;
		/** @var array Selector rules: array("name" => array("selector", "type"), ...) */
		private $regArr = array();
		/** @var array Extracted rows: array(rowIndex => array("name" => value, ...)) */
		public $jsonArr = array();
		/** @var string Optional block selector the rules are evaluated inside of */
		private $regRange;
		/** @var string HTML source being queried */
		private $html;
		/** @var string|false Target encoding of the results; a falsy value disables conversion */
		private $output_encoding;
		/** @var string|false Encoding detected for $html; false when detection failed */
		private $html_encoding;

		/**
		 * Constructor.
		 *
		 * Declared as __construct() because PHP 8 no longer treats a method
		 * named after the class as a constructor.
		 *
		 * @param string $page            URL of the page to scrape (https is supported), or the HTML source itself
		 * @param array  $regArr          Selector rules, array("name" => array("selector", "type"), ...);
		 *                                "type" is "text", "html", or the name of an attribute to read
		 * @param string $regRange        Block selector: when given, the document is first split into the
		 *                                matching blocks and every rule is evaluated inside each block
		 * @param string $getHtmlWay      How a URL is fetched: "curl" uses cURL, any other value uses file_get_contents()
		 * @param string $output_encoding Encoding of the results; pass a falsy value to leave the strings untouched
		 */
		public function __construct($page,$regArr=array(),$regRange='',$getHtmlWay="curl",$output_encoding="UTF-8")
		{
			$this->output_encoding = $output_encoding;

			if($this->isURL($page))
			{
				$this->pageURL = $page;
				if($getHtmlWay=="curl")
				{
					$source = $this->fetchWithCurl($this->pageURL);
				}else{
					$source = file_get_contents($this->pageURL);
				}
				// Both fetchers return false on failure; fall back to an empty
				// document so encoding detection and phpQuery still get a string.
				$this->html = ($source === false) ? '' : $source;
			}else{
				$this->html = $page;
			}

			// Detect the encoding of the source once, up front.
			$this->html_encoding = $this->get_encode($this->html);

			if(!empty($regArr))
			{
				$this->regArr = $regArr;
				$this->regRange = $regRange;
				$this->getList();
			}
		}

		/**
		 * Runs a new set of selector rules against the already loaded HTML.
		 * Previous results in $jsonArr are discarded.
		 *
		 * @param array  $regArr   Selector rules, same format as in the constructor
		 * @param string $regRange Optional block selector
		 */
		public function setQuery($regArr,$regRange='')
		{
			$this->jsonArr = array();
			$this->regArr = $regArr;
			$this->regRange = $regRange;
			$this->getList();
		}

		/**
		 * Returns the collected rows as a JSON string.
		 *
		 * @return string|false Result of json_encode() on $jsonArr
		 */
		public function getJSON()
		{
			return json_encode($this->jsonArr);
		}

		/**
		 * Downloads a page with cURL and returns its body.
		 *
		 * @param  string $url
		 * @return string|false Body of the response, or false when the request failed
		 */
		private function fetchWithCurl($url)
		{
			$ch = curl_init();
			curl_setopt($ch, CURLOPT_URL, $url);
			// NOTE(security): certificate and host verification are switched off so
			// that any https page can be fetched; this makes the request open to
			// man-in-the-middle tampering. Kept as-is for backward compatibility.
			curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
			curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
			curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
			$body = curl_exec($ch);
			curl_close($ch);
			return $body;
		}

		/**
		 * Applies the current rules to the HTML and fills $jsonArr.
		 *
		 * With a block selector, every matched block becomes one row and each
		 * rule is evaluated inside that block. Without one, each rule is run
		 * against the whole document and its n-th match goes into row n.
		 */
		private function getList()
		{
			$hobj = phpQuery::newDocumentHTML($this->html);

			if(!empty($this->regRange))
			{
				$i = 0;
				foreach(pq($hobj)->find($this->regRange) as $item)
				{
					// foreach replaces each(), which was removed in PHP 8.
					foreach($this->regArr as $key => $reg_value)
					{
						$iobj = pq($item)->find($reg_value[0]);
						$this->jsonArr[$i][$key] = $this->extractValue($iobj, $reg_value[1]);
					}
					$i++;
				}
			}
			else
			{
				foreach($this->regArr as $key => $reg_value)
				{
					$i = 0;
					foreach(pq($hobj)->find($reg_value[0]) as $item)
					{
						$this->jsonArr[$i++][$key] = $this->extractValue($item, $reg_value[1]);
					}
				}
			}

			// Convert only when an output encoding was requested AND the source
			// encoding could be detected; a false source encoding cannot be
			// handed to mb_convert_encoding().
			if($this->output_encoding && $this->html_encoding)
			{
				$this->jsonArr = $this->array_convert_encoding($this->jsonArr,$this->output_encoding,$this->html_encoding);
			}
		}

		/**
		 * Reads one value from a matched node according to the rule type.
		 *
		 * @param  mixed  $node Anything pq() accepts (a DOM node or a phpQuery object)
		 * @param  string $type "text", "html", or an attribute name
		 * @return mixed  Trimmed text / inner HTML, or whatever attr() returns for the attribute
		 */
		private function extractValue($node, $type)
		{
			switch($type)
			{
				case 'text':
					return trim(pq($node)->text());
				case 'html':
					return trim(pq($node)->html());
				default:
					return pq($node)->attr($type);
			}
		}

		/**
		 * Detects the encoding of a string.
		 *
		 * @param  string $string
		 * @return string|false Name of the detected encoding, or false when none of the candidates match
		 */
		private function get_encode($string){
			return mb_detect_encoding($string, array('ASCII','GB2312','GBK','UTF-8'));
		}

		/**
		 * Recursively converts the encoding of every value in an array.
		 *
		 * @param  array  $arr
		 * @param  string $to_encoding
		 * @param  string $from_encoding
		 * @return array
		 */
		private function array_convert_encoding($arr,$to_encoding,$from_encoding)
		{
			if(!is_array($arr))return $arr;
			foreach ($arr as $key => $value) {
				if (is_array($value)) {
					$arr[$key] = $this->array_convert_encoding($value,$to_encoding,$from_encoding);
				}else{
					// Explicit cast: a missing attribute yields null, and passing
					// null to mb_convert_encoding() is deprecated since PHP 8.1.
					$arr[$key] = mb_convert_encoding((string)$value, $to_encoding,$from_encoding);
				}
			}
			return $arr;
		}

		/**
		 * Rough check whether the argument looks like an http(s) URL.
		 *
		 * @param  string  $str
		 * @return boolean
		 */
		private function isURL($str)
		{
			return is_string($str) && preg_match('/^http(s)?:\/\/.+/', $str) === 1;
		}

	}
  // $hj = new QueryList("http://www.baidu.com/s?rn=20&ie=utf-8&bs=love+me&f=8&rsv_bp=1&wd=%E4%B8%83%E9%87%8C%E9%A6%99&rsv_sug3=2&rsv_sug=0&rsv_sug1=2&rsv_sug4=111&inputT=2224",array("title"=>array("h3.t a","text"),"url"=>array("h3.t a","href"),"con"=>array("div.c-abstract","html")),"table.result");
	//$hj=new QueryList("https://www.google.com.hk/search?filter=0&lr=&newwindow=1&safe=images&hl=en&as_qdr=all&q=QQ2014",array("hcon"=>array(".st","html")));
	//  print_r($hj->getJSON());
	 // print_r($hj->jsonArr);
// $hj = new QueryList('https://www.google.com.hk/search?filter=0&lr=&newwindow=1&safe=images&hl=en&as_qdr=all&q=QQ2014',array("url"=>array("h3.r a","href")));
// print_r($hj->jsonArr);
// $hj->setQuery(array("hcon"=>array("span.st","text")));
// print_r($hj->jsonArr);
/*
$ct = new QueryList("http://www.mianbao.com/cartoon/",array("title"=>array("a:eq(0)","text")),"ul.txt-list li","get");
print_r($ct->jsonArr);*/

/*$hj = new QueryList("http://t.sohu.com/jingxuan",array("con"=>array(".ugc","html")),"[id$=_con]","get","UTF-8");
print_r($hj->jsonArr);*/

/*$html = file_get_contents('http://www.baidu.com/s?rn=20&ie=utf-8&bs=love+me&f=8&rsv_bp=1&wd=%E4%B8%83%E9%87%8C%E9%A6%99&rsv_sug3=2&rsv_sug=0&rsv_sug1=2&rsv_sug4=111&inputT=2224');
$hj = new QueryList($html,array("title"=>array("h3.t a","text"),"url"=>array("h3.t a","href"),"con"=>array("div.c-abstract","html")),"table.result");
print_r($hj->jsonArr);*/
?>