优化QL并新增一些demo
This commit is contained in:
parent
6f973149e4
commit
93b7ca0b7d
@ -7,7 +7,7 @@
|
|||||||
* @author Jaeger
|
* @author Jaeger
|
||||||
* @email 734708094@qq.com
|
* @email 734708094@qq.com
|
||||||
* @link http://git.oschina.net/jae/QueryList
|
* @link http://git.oschina.net/jae/QueryList
|
||||||
* @version 2.2.0
|
* @version 2.2.1
|
||||||
*
|
*
|
||||||
* @example
|
* @example
|
||||||
*
|
*
|
||||||
@ -55,6 +55,8 @@ class QueryList
|
|||||||
private $outputEncoding;
|
private $outputEncoding;
|
||||||
private $htmlEncoding;
|
private $htmlEncoding;
|
||||||
private static $ql;
|
private static $ql;
|
||||||
|
/**
 * Private, empty constructor: the class cannot be instantiated with `new` from outside.
 * NOTE(review): presumably instances are meant to be obtained only through the static
 * entry-point method of this class (a private static $ql is declared nearby) — confirm.
 */
private function __construct() {
|
||||||
|
}
|
||||||
/**
|
/**
|
||||||
* 静态方法,访问入口
|
* 静态方法,访问入口
|
||||||
* @param string $page 要抓取的网页URL地址(支持https);或者是html源代码
|
* @param string $page 要抓取的网页URL地址(支持https);或者是html源代码
|
||||||
@ -119,7 +121,7 @@ class QueryList
|
|||||||
}
|
}
|
||||||
//获取编码格式
|
//获取编码格式
|
||||||
$this->htmlEncoding = $this->_getEncode($this->html);
|
$this->htmlEncoding = $this->_getEncode($this->html);
|
||||||
$this->html = $this->_removeTags($this->html,array('script','style'));
|
// $this->html = $this->_removeTags($this->html,array('script','style'));
|
||||||
if (!empty($regArr)) {
|
if (!empty($regArr)) {
|
||||||
$this->regArr = $regArr;
|
$this->regArr = $regArr;
|
||||||
$this->regRange = $regRange;
|
$this->regRange = $regRange;
|
||||||
|
@ -1,17 +1,20 @@
|
|||||||
<?php
|
<?php
|
||||||
require '../QueryList.class.php';
|
require '../QueryList.class.php';
|
||||||
|
header('Content-type:text/html;charset=utf-8');
|
||||||
//采集OSC的代码分享列表,标题 链接 作者
|
//采集OSC的代码分享列表,标题 链接 作者
|
||||||
$url = "http://www.oschina.net/code/list";
|
$url = "http://www.oschina.net/code/list";
|
||||||
$reg = array("title"=>array(".code_title a:eq(0)","text"),"url"=>array(".code_title a:eq(0)","href"),"author"=>array("img","title"));
|
$reg = array("title"=>array(".code_title a:eq(0)","text"),"url"=>array(".code_title a:eq(0)","href"),"author"=>array("img","title"));
|
||||||
$rang = ".code_list li";
|
$rang = ".code_list li";
|
||||||
//使用curl抓取源码并以GB2312编码格式输出
|
//使用curl抓取源码并以GBK编码格式输出
|
||||||
$hj = QueryList::Query($url,$reg,$rang,'curl','GB2312');
|
$hj = QueryList::Query($url,$reg,$rang,'curl','GBK');
|
||||||
$arr = $hj->jsonArr;
|
$arr = $hj->jsonArr;
|
||||||
echo "<pre>";
|
echo "<pre>";
|
||||||
print_r($arr);
|
print_r($arr);
|
||||||
echo "</pre><hr/>";
|
echo "</pre><hr/>";
|
||||||
|
|
||||||
|
echo '上面的是GBK格式输出的,而页面是UTF-8格式的,所以会看到输出是乱码!';
|
||||||
|
echo '<hr/>';
|
||||||
|
|
||||||
//如果还想采当前页面右边的 TOP40活跃贡献者 图像,得到JSON数据,可以这样写
|
//如果还想采当前页面右边的 TOP40活跃贡献者 图像,得到JSON数据,可以这样写
|
||||||
$reg = array("portrait"=>array(".hot_top img","src"));
|
$reg = array("portrait"=>array(".hot_top img","src"));
|
||||||
$hj->setQuery($reg);
|
$hj->setQuery($reg);
|
||||||
@ -26,3 +29,40 @@ $arr = $hj->jsonArr;
|
|||||||
echo "<pre>";
|
echo "<pre>";
|
||||||
print_r($arr);
|
print_r($arr);
|
||||||
echo "</pre><hr/>";
|
echo "</pre><hr/>";
|
||||||
|
|
||||||
|
// Scrape basic information about a website
|
||||||
|
// Define the rules: each key maps to array(selector, attribute-or-"text")
|
||||||
|
$reg = array(
|
||||||
|
// Site keywords: the content attribute of meta[name=keywords]
|
||||||
|
"kw" => array("meta[name=keywords]","content"),
|
||||||
|
// Site description: the content attribute of meta[name=description]
|
||||||
|
"desc" => array("meta[name=description]","content"),
|
||||||
|
// Site title: the text of the title element
|
||||||
|
"title" => array("title","text"),
|
||||||
|
// href of the first link element (the original note calls this the first CSS link)
|
||||||
|
"css1" => array("link:eq(0)","href"),
|
||||||
|
// src of the second script element that has a src attribute (:eq(1))
|
||||||
|
"js2" => array("script[src]:eq(1)","src")
|
||||||
|
);
|
||||||
|
// Target site to scrape
|
||||||
|
$url = 'http://x.44i.cc/';
|
||||||
|
// Run the scrape with only a URL and rules, then read the result from jsonArr
|
||||||
|
$data = QueryList::Query($url,$reg)->jsonArr;
|
||||||
|
print_r($data);
|
||||||
|
|
||||||
|
// The following demonstrates, on its own, how to use a callback function
|
||||||
|
// Scrape the site keywords and split them into individual keywords
|
||||||
|
$reg = array(
|
||||||
|
// Scrape the site keywords and call the user-defined function fun on the result.
|
||||||
|
// NOTE(review): the third element is an empty string; its meaning is not visible here — confirm.
|
||||||
|
"kw" => array("meta[name=keywords]","content",'','fun')
|
||||||
|
);
|
||||||
|
//自定义回调函数
|
||||||
|
/**
 * Callback named by the "kw" rule: splits the captured keywords string into pieces.
 *
 * @param string $content Captured value; presumably the content attribute of the keywords meta tag — confirm
 * @param mixed $key Not used by this function; presumably the rule key supplied by QueryList — confirm
 * @return array The parts of $content separated by ASCII commas
 */
function fun($content,$key){
|
||||||
|
// Split the keywords on ASCII commas; surrounding whitespace is not trimmed
|
||||||
|
return explode(',', $content);
|
||||||
|
}
|
||||||
|
// Target site to scrape
|
||||||
|
$url = 'http://x.44i.cc/';
|
||||||
|
// Run the scrape with the callback rule and read the result from jsonArr
|
||||||
|
$data = QueryList::Query($url,$reg)->jsonArr;
|
||||||
|
print_r($data);
|
Binary file not shown.
BIN
demo/一个完整的DEMO项目.zip
Normal file
BIN
demo/一个完整的DEMO项目.zip
Normal file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user