Compare commits
114 Commits
Author | SHA1 | Date | |
---|---|---|---|
|
894fb4344e | ||
|
e4fc716acd | ||
|
39dc0ca9c6 | ||
|
ef0a2efd4f | ||
|
5953daac54 | ||
|
465c6aefc7 | ||
|
92cb319d44 | ||
|
cbf3e0fcad | ||
|
cfa2d94a79 | ||
|
47a444bf9e | ||
|
85903fa9b5 | ||
|
e527c637c7 | ||
|
f0a9798925 | ||
|
faea883c6f | ||
|
c16826a573 | ||
|
1492751f98 | ||
|
b7954b9aef | ||
|
b3d84cf057 | ||
|
52bbdeae14 | ||
|
25b2dbdc86 | ||
|
02c2b125d8 | ||
|
fc8b701ef2 | ||
|
75e436c73f | ||
|
aa90e5a21d | ||
|
dd9af6881d | ||
|
b07d4bfc74 | ||
|
8c1614c4c3 | ||
|
b387ef5bb0 | ||
|
67f0052c5d | ||
|
7c86f82527 | ||
|
6ee6a26aee | ||
|
116f19da65 | ||
|
67cbd0f473 | ||
|
3eb26451c6 | ||
|
a76ecb4258 | ||
|
46f564bc8b | ||
|
df9e3bbf19 | ||
|
0c85eed7ef | ||
|
df521923ac | ||
|
a779ef71f3 | ||
|
c32736bd9e | ||
|
661bc3168d | ||
|
6d182ff061 | ||
|
1c2e3f4adf | ||
|
1d73895981 | ||
|
03e6a955bf | ||
|
72a7543da3 | ||
|
9d04003d73 | ||
|
31ec950cdc | ||
|
18bc6daea4 | ||
|
f2c6ce7385 | ||
|
c0ed870dc8 | ||
|
a4d0087e47 | ||
|
a0f7b9aa3e | ||
|
d812c47ede | ||
|
47c0f37233 | ||
|
967ef10f23 | ||
|
c82eb3c557 | ||
|
f68cc2e218 | ||
|
684e52c70e | ||
|
777d837f18 | ||
|
6e9a202ac2 | ||
|
e885eece26 | ||
|
aeeec5367e | ||
|
c42a7b1766 | ||
|
a3a830a744 | ||
|
7381ec21d3 | ||
|
95102a5ce2 | ||
|
520195c929 | ||
|
75799decc3 | ||
|
33c574cdb9 | ||
|
47a777789b | ||
|
ad8ce44572 | ||
|
1c54d63993 | ||
|
59d48911fd | ||
|
5ed0921d17 | ||
|
fcdc5a16db | ||
|
a8a438edbe | ||
|
bd58352117 | ||
|
c3f8a48357 | ||
|
006e24a117 | ||
|
042993311f | ||
|
b6c21b653a | ||
|
5422168c98 | ||
|
624f071a0d | ||
|
042c10cdea | ||
|
2013e4d2b0 | ||
|
ad9b493fc0 | ||
|
43d8f71678 | ||
|
02fe5a7f9e | ||
|
8bd07f5fbb | ||
|
02c4c93ee5 | ||
|
0fafaafa7b | ||
|
fe749f08c2 | ||
|
e3576ce407 | ||
|
1a7864dcf8 | ||
|
5cc049992b | ||
|
967f2d95cd | ||
|
7f6b6e279e | ||
|
198385e336 | ||
|
26d6cf5e43 | ||
|
700d56db49 | ||
|
1691ddf3ee | ||
|
cbae16c6a4 | ||
|
66c4ef8c4f | ||
|
330c71778f | ||
|
1185ad399f | ||
|
b3290d2484 | ||
|
8e4cf456f2 | ||
|
f006e751ef | ||
|
64884ee72f | ||
|
777738adc3 | ||
|
4a003e5490 | ||
|
fbea1aaa94 |
12
.github/FUNDING.yml
vendored
Normal file
12
.github/FUNDING.yml
vendored
Normal file
@@ -0,0 +1,12 @@
|
||||
# These are supported funding model platforms
|
||||
|
||||
github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
|
||||
patreon: # Replace with a single Patreon username
|
||||
open_collective: querylist # Replace with a single Open Collective username
|
||||
ko_fi: # Replace with a single Ko-fi username
|
||||
tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
|
||||
community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
|
||||
liberapay: # Replace with a single Liberapay username
|
||||
issuehunt: # Replace with a single IssueHunt username
|
||||
otechie: # Replace with a single Otechie username
|
||||
custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
|
4
.gitignore
vendored
4
.gitignore
vendored
@@ -1,3 +1,5 @@
|
||||
/vendor/
|
||||
.idea/
|
||||
composer.lock
|
||||
composer.lock
|
||||
.DS_Store
|
||||
*.cache
|
423
QueryList.php
423
QueryList.php
@@ -1,423 +0,0 @@
|
||||
<?php
|
||||
|
||||
namespace QL;
|
||||
|
||||
use phpQuery,Exception,ReflectionClass;
|
||||
|
||||
/**
|
||||
* QueryList
|
||||
*
|
||||
* 一个基于phpQuery的通用列表采集类
|
||||
*
|
||||
* @author Jaeger
|
||||
* @email 734708094@qq.com
|
||||
* @link http://git.oschina.net/jae/QueryList
|
||||
* @version 3.1.1
|
||||
*
|
||||
* @example
|
||||
*
|
||||
//获取CSDN移动开发栏目下的文章列表标题
|
||||
$hj = QueryList::Query('http://mobile.csdn.net/',array("title"=>array('.unit h1','text')));
|
||||
print_r($hj->data);
|
||||
|
||||
//回调函数1
|
||||
function callfun1($content,$key)
|
||||
{
|
||||
return '回调函数1:'.$key.'-'.$content;
|
||||
}
|
||||
class HJ{
|
||||
//回调函数2
|
||||
static public function callfun2($content,$key)
|
||||
{
|
||||
return '回调函数2:'.$key.'-'.$content;
|
||||
}
|
||||
}
|
||||
//获取CSDN文章页下面的文章标题和内容
|
||||
$url = 'http://www.csdn.net/article/2014-06-05/2820091-build-or-buy-a-mobile-game-backend';
|
||||
$rules = array(
|
||||
'title'=>array('h1','text','','callfun1'), //获取纯文本格式的标题,并调用回调函数1
|
||||
'summary'=>array('.summary','text','-input strong'), //获取纯文本的文章摘要,但保strong标签并去除input标签
|
||||
'content'=>array('.news_content','html','div a -.copyright'), //获取html格式的文章内容,但过滤掉div和a标签,去除类名为copyright的元素
|
||||
'callback'=>array('HJ','callfun2') //调用回调函数2作为全局回调函数
|
||||
);
|
||||
$rang = '.left';
|
||||
$hj = QueryList::Query($url,$rules,$rang);
|
||||
print_r($hj->data);
|
||||
|
||||
//继续获取右边相关热门文章列表的标题以及链接地址
|
||||
$hj->setQuery(array('title'=>array('','text'),'url'=>array('a','href')),'#con_two_2 li');
|
||||
//输出数据
|
||||
echo $hj->getData();
|
||||
|
||||
*/
|
||||
|
||||
/**
 * QueryList: a generic list scraper built on top of phpQuery.
 *
 * A query is described by a rule map of the form
 *   array('name' => array(selector, type [, tagFilter [, callback]]), ... [, 'callback' => globalCallback])
 * and an optional "range" selector that first splits the page into blocks.
 * Results are stored in $data as a list of rows keyed by rule name.
 */
class QueryList
{
    /** @var array Rows produced by the most recent query. */
    public $data;

    /** @var string Raw HTML the most recent query was run against. */
    public $html;

    /** @var string HTML as re-serialised by phpQuery after parsing (see getHtml()). */
    private $pqHtml = '';

    /** @var string|false Target encoding for results; false leaves the data untouched. */
    private $outputEncoding = false;

    /** @var string|false Explicit source encoding; false means auto-detect. */
    private $inputEncoding = false;

    /** @var string|false Encoding actually assumed for the current page. */
    private $htmlEncoding;

    /** @var array Rule map of the current query. */
    private $regArr = array();

    /** @var string Block ("range") selector of the current query. */
    private $regRange = '';

    /** @var array Instances cached by getInstance(), keyed by a hash of the arguments. */
    public static $instances = array();

    public function __construct()
    {
    }

    /**
     * Static entry point: run a query against a URL or an HTML string.
     *
     * @param string     $page           URL to fetch (http/https) or raw HTML source
     * @param array      $rules          Rule map: array('name' => array(selector, type[, tagFilter][, callback]), ...[, 'callback' => globalCallback]).
     *                                   - selector: any jQuery-style selector
     *                                   - type: 'text', 'html', or the name of an HTML attribute
     *                                   - tagFilter: space separated list; an entry prefixed with '-' is a selector
     *                                     whose elements are removed together with their content; other entries are
     *                                     tags to keep (type 'text') or tags to strip (type 'html')
     *                                   - callback: callable receiving (content, ruleKey); a per-rule callback
     *                                     takes precedence over the global one
     * @param string     $range          Block selector: rules are applied inside each matched block
     * @param string     $outputEncoding Encoding to convert results to; falsy keeps the current setting
     * @param string     $inputEncoding  Encoding of the page; falsy keeps the current setting (auto-detect by default)
     * @param bool|false $removeHead     Empty the page's head section before parsing (workaround for garbled text)
     * @return mixed The shared QueryList instance
     */
    public static function Query($page, array $rules, $range = '', $outputEncoding = null, $inputEncoding = null, $removeHead = false)
    {
        return self::getInstance()->_query($page, $rules, $range, $outputEncoding, $inputEncoding, $removeHead);
    }

    /**
     * Run a QueryList extension (a class in the QL\Ext namespace).
     *
     * @param string $class Extension class name without namespace
     * @param array  $args  Arguments forwarded to the extension's run() method
     * @return mixed
     * @throws Exception when the extension class does not exist
     */
    public static function run($class, $args = array())
    {
        $extension = self::getInstance("QL\\Ext\\{$class}");
        return $extension->run($args);
    }

    /**
     * Get (and cache) an instance of an arbitrary class.
     *
     * The first argument is the class name, remaining arguments are passed to
     * its constructor. Without arguments an instance of this class is returned.
     * Instances are cached per distinct argument list.
     *
     * @return mixed
     * @throws Exception when the class does not exist
     */
    public static function getInstance()
    {
        $args = func_get_args();
        if (!count($args)) {
            $args = array(self::class);
        }
        // The cache key covers the class name and all constructor arguments.
        $key = md5(serialize($args));
        $className = array_shift($args);
        if (!class_exists($className)) {
            throw new Exception("no class {$className}");
        }
        if (!isset(self::$instances[$key])) {
            $rc = new ReflectionClass($className);
            self::$instances[$key] = $rc->newInstanceArgs($args);
        }
        return self::$instances[$key];
    }

    /**
     * Get the source of the target page (mainly for debugging).
     *
     * @param bool|true $rel true: HTML as serialised by phpQuery after parsing; false: the raw HTML
     * @return string
     */
    public function getHtml($rel = true)
    {
        return $rel ? $this->pqHtml : $this->html;
    }

    /**
     * Get the scraped rows.
     *
     * @param callback $callback Optional callable applied to every row via array_map()
     * @return array
     */
    public function getData($callback = null)
    {
        if (is_callable($callback)) {
            return array_map($callback, $this->data);
        }
        return $this->data;
    }

    /**
     * Run a new set of rules against the HTML of the previous query.
     *
     * @param array      $rules
     * @param string     $range
     * @param string     $outputEncoding
     * @param string     $inputEncoding
     * @param bool|false $removeHead
     * @return QueryList
     */
    public function setQuery(array $rules, $range = '', $outputEncoding = null, $inputEncoding = null, $removeHead = false)
    {
        return $this->_query($this->html, $rules, $range, $outputEncoding, $inputEncoding, $removeHead);
    }

    /**
     * Load the page, remember the query settings and collect the data.
     *
     * Truthy encoding arguments overwrite the stored settings; falsy ones
     * leave the settings of the previous query in place.
     *
     * @return QueryList $this
     */
    private function _query($page, array $rules, $range, $outputEncoding, $inputEncoding, $removeHead)
    {
        $this->data = array();
        $this->html = $this->_isURL($page) ? $this->_request($page) : $page;
        if ($outputEncoding) {
            $this->outputEncoding = $outputEncoding;
        }
        if ($inputEncoding) {
            $this->inputEncoding = $inputEncoding;
        }
        if ($removeHead) {
            $this->html = $this->_removeHead($this->html);
        }
        $this->pqHtml = '';
        if (empty($this->html)) {
            trigger_error("The received content is empty!", E_USER_NOTICE);
        }
        // Determine the page encoding: explicit setting first, detection otherwise.
        $this->htmlEncoding = $this->inputEncoding ? $this->inputEncoding : $this->_getEncode($this->html);
        $this->regArr = $rules;
        $this->regRange = $range;
        $this->_getList();
        return $this;
    }

    /**
     * Apply the stored rules to the stored HTML and fill $data.
     *
     * With a range selector every matched block becomes one row; without it,
     * the n-th match of each rule goes into row n.
     */
    private function _getList()
    {
        $this->inputEncoding && phpQuery::$defaultCharset = $this->inputEncoding;
        $document = phpQuery::newDocumentHTML($this->html);
        $this->pqHtml = $document->htmlOuter();
        if (!empty($this->regRange)) {
            $blocks = pq($document)->find($this->regRange);
            $i = 0;
            foreach ($blocks as $block) {
                foreach ($this->regArr as $key => $rule) {
                    // Strict comparison: with '==' an integer key 0 equals 'callback' on PHP < 8.
                    if ($key === 'callback') {
                        continue;
                    }
                    $this->data[$i][$key] = $this->_extract(pq($block)->find($rule[0]), $rule, $key);
                }
                $i++;
            }
        } else {
            foreach ($this->regArr as $key => $rule) {
                if ($key === 'callback') {
                    continue;
                }
                // The document is deliberately re-parsed for every rule.
                // NOTE(review): presumably needed because _removeTags() creates and unloads
                // temporary phpQuery documents, which changes phpQuery's default document;
                // confirm against phpQuery before hoisting this out of the loop.
                $document = phpQuery::newDocumentHTML($this->html);
                $matches = pq($document)->find($rule[0]);
                $i = 0;
                foreach ($matches as $item) {
                    $this->data[$i][$key] = $this->_extract($item, $rule, $key);
                    $i++;
                }
            }
        }
        if ($this->outputEncoding) {
            // Convert the collected data to the requested output encoding.
            $this->data = $this->_arrayConvertEncoding($this->data, $this->outputEncoding, $this->htmlEncoding);
        }
        phpQuery::$documents = array();
    }

    /**
     * Extract one value according to a single rule and run its callback.
     *
     * @param mixed  $target Node or phpQuery object the rule applies to
     * @param array  $rule   array(selector, type[, tagFilter][, callback])
     * @param string $key    Rule name, passed to the callback as second argument
     * @return mixed
     */
    private function _extract($target, $rule, $key)
    {
        $tags = isset($rule[2]) ? $rule[2] : '';
        $node = pq($target);
        switch ($rule[1]) {
            case 'text':
                $value = $this->_allowTags($node->html(), $tags);
                break;
            case 'html':
                $value = $this->_stripTags($node->html(), $tags);
                break;
            default:
                // Any other type is treated as an attribute name.
                $value = $node->attr($rule[1]);
                break;
        }
        // A per-rule callback takes precedence over the global one.
        if (isset($rule[3])) {
            $value = call_user_func($rule[3], $value, $key);
        } elseif (isset($this->regArr['callback'])) {
            $value = call_user_func($this->regArr['callback'], $value, $key);
        }
        return $value;
    }

    /**
     * Fetch a URL, using cURL when available and file_get_contents() otherwise.
     *
     * The URL itself is sent as Referer.
     *
     * @param string $url
     * @return string|false Response body, or false on failure
     */
    private function _request($url)
    {
        if (function_exists('curl_init')) {
            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, $url);
            // NOTE(review): TLS peer/host verification is disabled here, so https responses
            // are not authenticated. Kept as in the original; confirm this is intended.
            curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
            curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
            curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
            curl_setopt($ch, CURLOPT_AUTOREFERER, true);
            curl_setopt($ch, CURLOPT_REFERER, $url);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
            $result = curl_exec($ch);
            curl_close($ch);
            return $result;
        }
        $opts = array(
            'http' => array(
                'header' => "Referer:{$url}"
            )
        );
        return file_get_contents($url, false, stream_context_create($opts));
    }

    /**
     * Replace the page's head section with an empty one.
     *
     * @param string $html
     * @return string The input is returned unchanged if the regex engine fails
     */
    private function _removeHead($html)
    {
        // [\s>] after "head" avoids matching <header>; the lazy body stops at the first </head>.
        $result = preg_replace('/<head(?=[\s>])[^>]*>.*?<\/head>/is', '<head></head>', $html);
        return $result === null ? $html : $result;
    }

    /**
     * Guess the encoding of a string.
     *
     * @param string $string
     * @return string|false Detected encoding, or false when none of the candidates fits
     */
    private function _getEncode($string)
    {
        return mb_detect_encoding($string, array('ASCII', 'GB2312', 'GBK', 'UTF-8'));
    }

    /**
     * Convert the encoding of all string keys and string values of a (nested) array.
     *
     * Non-string values are kept as they are. When the source encoding is
     * unknown the array is returned unchanged.
     *
     * @param array  $arr
     * @param string $toEncoding
     * @param string $fromEncoding
     * @return array
     */
    private function _arrayConvertEncoding($arr, $toEncoding, $fromEncoding)
    {
        if (!$fromEncoding || !$toEncoding) {
            return $arr;
        }
        $converted = array();
        foreach ($arr as $key => $value) {
            if (is_string($key)) {
                $key = $this->_convertString($key, $toEncoding, $fromEncoding);
            }
            if (is_array($value)) {
                $value = $this->_arrayConvertEncoding($value, $toEncoding, $fromEncoding);
            } elseif (is_string($value)) {
                $value = $this->_convertString($value, $toEncoding, $fromEncoding);
            }
            $converted[$key] = $value;
        }
        return $converted;
    }

    /**
     * Convert one string with iconv, dropping characters the target cannot represent.
     *
     * @param string $string
     * @param string $toEncoding
     * @param string $fromEncoding
     * @return string The original string when iconv reports a failure
     */
    private function _convertString($string, $toEncoding, $fromEncoding)
    {
        $result = iconv($fromEncoding, $toEncoding . '//IGNORE', $string);
        return $result === false ? $string : $result;
    }

    /**
     * Rough check whether the argument looks like an http(s) URL.
     *
     * @param string $str
     * @return boolean
     */
    private function _isURL($str)
    {
        return (bool) preg_match('/^http(s)?:\\/\\/.+/', $str);
    }

    /**
     * Strip specific HTML tags (the tags only, their content is kept).
     *
     * Entries of the filter prefixed with '-' are removed together with their content.
     *
     * @param string $html
     * @param string $tags_str Tag names separated by whitespace
     * @return string
     */
    private function _stripTags($html, $tags_str)
    {
        $tagsArr = $this->_tag($tags_str);
        $html = trim($this->_removeTags($html, $tagsArr[1]));
        $patterns = array();
        foreach ($tagsArr[0] as $tag) {
            // The lookahead makes sure the whole tag name matches: "a" must not strip <article>.
            $patterns[] = '/<\/?' . preg_quote($tag, '/') . '(?=[\s\/>])[^>]*>/i';
        }
        $result = preg_replace($patterns, "", $html);
        return $result === null ? $html : $result;
    }

    /**
     * Strip all HTML tags except the given ones.
     *
     * Entries of the filter prefixed with '-' are removed together with their content.
     *
     * @param string $html
     * @param string $tags_str Tag names separated by whitespace
     * @return string
     */
    private function _allowTags($html, $tags_str)
    {
        $tagsArr = $this->_tag($tags_str);
        $html = $this->_removeTags($html, $tagsArr[1]);
        $allow = '';
        foreach ($tagsArr[0] as $tag) {
            $allow .= "<$tag> ";
        }
        return strip_tags(trim($html), $allow);
    }

    /**
     * Split a tag filter string into its two kinds of entries.
     *
     * @param string $tags_str Whitespace separated filter entries
     * @return array array(0 => plain tag names, 1 => selectors that were prefixed with '-')
     */
    private function _tag($tags_str)
    {
        $tagArr = preg_split("/\s+/", $tags_str, -1, PREG_SPLIT_NO_EMPTY);
        $tags = array(array(), array());
        foreach ($tagArr as $tag) {
            // Only a leading '-' marks a removal selector; a hyphen inside a name is part of the name.
            if (preg_match('/^-(.+)/', $tag, $arr)) {
                $tags[1][] = $arr[1];
            } else {
                $tags[0][] = $tag;
            }
        }
        return $tags;
    }

    /**
     * Remove the elements matching the given selectors, including their content.
     *
     * @param string $html
     * @param array  $tags List of selectors
     * @return string The input is returned unchanged when the list is empty
     */
    private function _removeTags($html, $tags)
    {
        if (count($tags)) {
            $tag_str = implode(',', $tags);
            phpQuery::$defaultCharset = $this->inputEncoding ? $this->inputEncoding : $this->htmlEncoding;
            $doc = phpQuery::newDocumentHTML($html);
            pq($doc)->find($tag_str)->remove();
            $html = pq($doc)->htmlOuter();
            $doc->unloadDocument();
        }
        return $html;
    }
}
|
||||
|
||||
/*
|
||||
class Autoload
|
||||
{
|
||||
public static function load($className)
|
||||
{
|
||||
$files = array(
|
||||
sprintf('%s/extensions/%s.php',__DIR__,$className),
|
||||
sprintf('%s/extensions/vendors/%s.php',__DIR__,$className)
|
||||
);
|
||||
foreach ($files as $file) {
|
||||
if(is_file($file)){
|
||||
require $file;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
spl_autoload_register(array('Autoload','load'));
|
||||
|
||||
*/
|
309
README-ZH.md
Normal file
309
README-ZH.md
Normal file
@@ -0,0 +1,309 @@
|
||||
<p align="center">
|
||||
<img width="150" src="logo.png" alt="QueryList">
|
||||
<br>
|
||||
<br>
|
||||
</p>
|
||||
|
||||
# QueryList 简介
|
||||
`QueryList`是一套简洁、优雅、可扩展的PHP采集工具(爬虫),基于phpQuery。
|
||||
|
||||
## 特性
|
||||
- 拥有与jQuery完全相同的CSS3 DOM选择器
|
||||
- 拥有与jQuery完全相同的DOM操作API
|
||||
- 拥有通用的列表采集方案
|
||||
- 拥有强大的HTTP请求套件,轻松实现如:模拟登陆、伪造浏览器、HTTP代理等任意复杂的网络请求
|
||||
- 拥有乱码解决方案
|
||||
- 拥有强大的内容过滤功能,可使用jQuery选择器来过滤内容
|
||||
- 拥有高度的模块化设计,扩展性强
|
||||
- 拥有富有表现力的API
|
||||
- 拥有高质量文档
|
||||
- 拥有丰富的插件
|
||||
- 拥有专业的问答社区和交流群
|
||||
|
||||
通过插件可以轻松实现诸如:
|
||||
- 多线程采集
|
||||
- 采集JavaScript动态渲染的页面 (PhantomJS/headless WebKit)
|
||||
- 图片本地化
|
||||
- 模拟浏览器行为,如:提交Form表单
|
||||
- 网络爬虫
|
||||
- .....
|
||||
|
||||
## 环境要求
|
||||
- PHP >= 7.1
|
||||
|
||||
> 如果你的PHP版本还停留在PHP5,或者不会使用Composer,你可以选择使用QueryList3,QueryList3支持php5.3以及手动安装。
|
||||
QueryList3 文档:http://v3.querylist.cc
|
||||
|
||||
## 安装
|
||||
通过Composer安装:
|
||||
```
|
||||
composer require jaeger/querylist
|
||||
```
|
||||
|
||||
## 使用
|
||||
|
||||
#### 元素操作
|
||||
- 采集「昵图网」所有图片地址
|
||||
|
||||
```php
|
||||
QueryList::get('http://www.nipic.com')->find('img')->attrs('src');
|
||||
```
|
||||
- 采集百度搜索结果
|
||||
|
||||
```php
|
||||
$ql = QueryList::get('http://www.baidu.com/s?wd=QueryList');
|
||||
|
||||
$ql->find('title')->text(); // 获取网站标题
|
||||
$ql->find('meta[name=keywords]')->content; // 获取网站头部关键词
|
||||
|
||||
$ql->find('h3>a')->texts(); //获取搜索结果标题列表
|
||||
$ql->find('h3>a')->attrs('href'); //获取搜索结果链接列表
|
||||
|
||||
$ql->find('img')->src; //获取第一张图片的链接地址
|
||||
$ql->find('img:eq(1)')->src; //获取第二张图片的链接地址
|
||||
$ql->find('img')->eq(2)->src; //获取第三张图片的链接地址
|
||||
// 遍历所有图片
|
||||
$ql->find('img')->map(function($img){
|
||||
echo $img->alt; //打印图片的alt属性
|
||||
});
|
||||
```
|
||||
- 更多用法
|
||||
|
||||
```php
|
||||
$ql->find('#head')->append('<div>追加内容</div>')->find('div')->htmls();
|
||||
$ql->find('.two')->children('img')->attrs('alt'); //获取class为two元素下的所有img孩子节点
|
||||
//遍历class为two元素下的所有孩子节点
|
||||
$data = $ql->find('.two')->children()->map(function ($item){
|
||||
//用is判断节点类型
|
||||
if($item->is('a')){
|
||||
return $item->text();
|
||||
}elseif($item->is('img'))
|
||||
{
|
||||
return $item->alt;
|
||||
}
|
||||
});
|
||||
|
||||
$ql->find('a')->attr('href', 'newVal')->removeClass('className')->html('newHtml')->...
|
||||
$ql->find('div > p')->add('div > ul')->filter(':has(a)')->find('p:first')->nextAll()->andSelf()->...
|
||||
$ql->find('div.old')->replaceWith( $ql->find('div.new')->clone())->appendTo('.trash')->prepend('Deleted')->...
|
||||
```
|
||||
#### 列表采集
|
||||
采集百度搜索结果列表的标题和链接:
|
||||
```php
|
||||
$data = QueryList::get('http://www.baidu.com/s?wd=QueryList')
|
||||
// 设置采集规则
|
||||
->rules([
|
||||
'title'=>array('h3','text'),
|
||||
'link'=>array('h3>a','href')
|
||||
])
|
||||
->query()->getData();
|
||||
|
||||
print_r($data->all());
|
||||
```
|
||||
采集结果:
|
||||
```
|
||||
Array
|
||||
(
|
||||
[0] => Array
|
||||
(
|
||||
[title] => QueryList|基于phpQuery的无比强大的PHP采集工具
|
||||
[link] => http://www.baidu.com/link?url=GU_YbDT2IHk4ns1tjG2I8_vjmH0SCJEAPuuZN
|
||||
)
|
||||
[1] => Array
|
||||
(
|
||||
[title] => PHP 用QueryList抓取网页内容 - wb145230 - 博客园
|
||||
[link] => http://www.baidu.com/link?url=zn0DXBnrvIF2ibRVW34KcRVFG1_bCdZvqvwIhUqiXaS
|
||||
)
|
||||
[2] => Array
|
||||
(
|
||||
[title] => 介绍- QueryList指导文档
|
||||
[link] => http://www.baidu.com/link?url=pSypvMovqS4v2sWeQo5fDBJ4EoYhXYi0Lxx
|
||||
)
|
||||
//...
|
||||
)
|
||||
```
|
||||
#### 编码转换
|
||||
```php
|
||||
// 输出编码:UTF-8,输入编码:GB2312
|
||||
QueryList::get('https://top.etao.com')->encoding('UTF-8','GB2312')->find('a')->texts();
|
||||
|
||||
// 输出编码:UTF-8,输入编码:自动识别
|
||||
QueryList::get('https://top.etao.com')->encoding('UTF-8')->find('a')->texts();
|
||||
```
|
||||
|
||||
#### HTTP网络操作(GuzzleHttp)
|
||||
- 携带cookie登录新浪微博
|
||||
```php
|
||||
//采集新浪微博需要登录才能访问的页面
|
||||
$ql = QueryList::get('http://weibo.com','param1=testvalue & params2=somevalue',[
|
||||
'headers' => [
|
||||
//填写从浏览器获取到的cookie
|
||||
'Cookie' => 'SINAGLOBAL=546064; wb_cmtLike_2112031=1; wvr=6;....'
|
||||
]
|
||||
]);
|
||||
//echo $ql->getHtml();
|
||||
echo $ql->find('title')->text();
|
||||
//输出: 我的首页 微博-随时随地发现新鲜事
|
||||
```
|
||||
- 使用Http代理
|
||||
```php
|
||||
$urlParams = ['param1' => 'testvalue','params2' => 'somevalue'];
|
||||
$opts = [
|
||||
// 设置http代理
|
||||
'proxy' => 'http://222.141.11.17:8118',
|
||||
//设置超时时间,单位:秒
|
||||
'timeout' => 30,
|
||||
// 伪造http头
|
||||
'headers' => [
|
||||
'Referer' => 'https://querylist.cc/',
|
||||
'User-Agent' => 'testing/1.0',
|
||||
'Accept' => 'application/json',
|
||||
'X-Foo' => ['Bar', 'Baz'],
|
||||
'Cookie' => 'abc=111;xxx=222'
|
||||
]
|
||||
];
|
||||
$ql->get('http://httpbin.org/get',$urlParams,$opts);
|
||||
// echo $ql->getHtml();
|
||||
```
|
||||
|
||||
- 模拟登录
|
||||
```php
|
||||
// 用post登录
|
||||
$ql = QueryList::post('http://xxxx.com/login',[
|
||||
'username' => 'admin',
|
||||
'password' => '123456'
|
||||
])->get('http://xxx.com/admin');
|
||||
//采集需要登录才能访问的页面
|
||||
$ql->get('http://xxx.com/admin/page');
|
||||
//echo $ql->getHtml();
|
||||
```
|
||||
|
||||
#### Form表单操作
|
||||
模拟登陆GitHub
|
||||
```php
|
||||
// 获取QueryList实例
|
||||
$ql = QueryList::getInstance();
|
||||
//获取到登录表单
|
||||
$form = $ql->get('https://github.com/login')->find('form');
|
||||
|
||||
//填写GitHub用户名和密码
|
||||
$form->find('input[name=login]')->val('your github username or email');
|
||||
$form->find('input[name=password]')->val('your github password');
|
||||
|
||||
//序列化表单数据
|
||||
$fromData = $form->serializeArray();
|
||||
$postData = [];
|
||||
foreach ($fromData as $item) {
|
||||
$postData[$item['name']] = $item['value'];
|
||||
}
|
||||
|
||||
//提交登录表单
|
||||
$actionUrl = 'https://github.com'.$form->attr('action');
|
||||
$ql->post($actionUrl,$postData);
|
||||
//判断登录是否成功
|
||||
// echo $ql->getHtml();
|
||||
$userName = $ql->find('.header-nav-current-user>.css-truncate-target')->text();
|
||||
if($userName)
|
||||
{
|
||||
echo '登录成功!欢迎你:'.$userName;
|
||||
}else{
|
||||
echo '登录失败!';
|
||||
}
|
||||
```
|
||||
#### Bind功能扩展
|
||||
自定义扩展一个`myHttp`方法:
|
||||
```php
|
||||
$ql = QueryList::getInstance();
|
||||
|
||||
//绑定一个myHttp方法到QueryList对象
|
||||
$ql->bind('myHttp',function ($url){
|
||||
// $this 为当前的QueryList对象
|
||||
$html = file_get_contents($url);
|
||||
$this->setHtml($html);
|
||||
return $this;
|
||||
});
|
||||
|
||||
//然后就可以通过注册的名字来调用
|
||||
$data = $ql->myHttp('https://toutiao.io')->find('h3 a')->texts();
|
||||
print_r($data->all());
|
||||
```
|
||||
或者把实现体封装到class,然后这样绑定:
|
||||
```php
|
||||
$ql->bind('myHttp',function ($url){
|
||||
return new MyHttp($this,$url);
|
||||
});
|
||||
```
|
||||
|
||||
#### 插件使用
|
||||
- 使用PhantomJS插件采集JavaScript动态渲染的页面:
|
||||
|
||||
```php
|
||||
// 安装时设置PhantomJS二进制文件路径
|
||||
$ql = QueryList::use(PhantomJs::class,'/usr/local/bin/phantomjs');
|
||||
|
||||
// 采集今日头条手机版
|
||||
$data = $ql->browser('https://m.toutiao.com')->find('p')->texts();
|
||||
print_r($data->all());
|
||||
|
||||
// 使用HTTP代理
|
||||
$ql->browser('https://m.toutiao.com',false,[
|
||||
'--proxy' => '192.168.1.42:8080',
|
||||
'--proxy-type' => 'http'
|
||||
])
|
||||
```
|
||||
|
||||
- 使用CURL多线程插件,多线程采集GitHub排行榜:
|
||||
|
||||
```php
|
||||
$ql = QueryList::use(CurlMulti::class);
|
||||
$ql->curlMulti([
|
||||
'https://github.com/trending/php',
|
||||
'https://github.com/trending/go',
|
||||
//.....more urls
|
||||
])
|
||||
// 每个任务成功完成调用此回调
|
||||
->success(function (QueryList $ql,CurlMulti $curl,$r){
|
||||
echo "Current url:{$r['info']['url']} \r\n";
|
||||
$data = $ql->find('h3 a')->texts();
|
||||
print_r($data->all());
|
||||
})
|
||||
// 每个任务失败回调
|
||||
->error(function ($errorInfo,CurlMulti $curl){
|
||||
echo "Current url:{$errorInfo['info']['url']} \r\n";
|
||||
print_r($errorInfo['error']);
|
||||
})
|
||||
->start([
|
||||
// 最大并发数
|
||||
'maxThread' => 10,
|
||||
// 错误重试次数
|
||||
'maxTry' => 3,
|
||||
]);
|
||||
|
||||
```
|
||||
|
||||
## 插件
|
||||
- [jae-jae/QueryList-PhantomJS](https://github.com/jae-jae/QueryList-PhantomJS): 使用PhantomJS采集JavaScript动态渲染的页面
|
||||
- [jae-jae/QueryList-CurlMulti](https://github.com/jae-jae/QueryList-CurlMulti) : Curl多线程采集
|
||||
- [jae-jae/QueryList-AbsoluteUrl](https://github.com/jae-jae/QueryList-AbsoluteUrl) : 转换URL相对路径到绝对路径
|
||||
- [jae-jae/QueryList-Rule-Google](https://github.com/jae-jae/QueryList-Rule-Google) : 谷歌搜索引擎
|
||||
- [jae-jae/QueryList-Rule-Baidu](https://github.com/jae-jae/QueryList-Rule-Baidu) : 百度搜索引擎
|
||||
|
||||
|
||||
查看更多的QueryList插件和基于QueryList的产品:[QueryList社区力量](https://github.com/jae-jae/QueryList-Community)
|
||||
|
||||
## 贡献
|
||||
欢迎为QueryList贡献代码。关于贡献插件可以查看:[QueryList插件贡献说明](https://github.com/jae-jae/QueryList-Community/blob/master/CONTRIBUTING.md)
|
||||
|
||||
## 寻求帮助?
|
||||
- QueryList主页: [http://querylist.cc](http://querylist.cc/)
|
||||
- QueryList文档: [http://doc.querylist.cc](http://doc.querylist.cc/)
|
||||
- QueryList问答:[http://wenda.querylist.cc](http://wenda.querylist.cc/)
|
||||
- QueryList交流QQ群:123266961 <a target="_blank" href="http://shang.qq.com/wpa/qunwpa?idkey=a1b248ae30b3f711bdab4f799df839300dc7fed54331177035efa0513da027f6"><img border="0" src="http://pub.idqqimg.com/wpa/images/group.png" alt="cafeEX" title="cafeEX"></a>
|
||||
- GitHub:https://github.com/jae-jae/QueryList
|
||||
- Git@OSC:http://git.oschina.net/jae/QueryList
|
||||
|
||||
## Author
|
||||
Jaeger <JaegerCode@gmail.com>
|
||||
|
||||
## License
|
||||
QueryList is licensed under the license of MIT. See the LICENSE for more details.
|
315
README.md
315
README.md
@@ -1,11 +1,304 @@
|
||||
#QueryList交流社区: [http://querylist.cc/](http://querylist.cc/)
|
||||
#QueryList交流QQ群:123266961 <a target="_blank" href="http://shang.qq.com/wpa/qunwpa?idkey=a1b248ae30b3f711bdab4f799df839300dc7fed54331177035efa0513da027f6"><img border="0" src="http://pub.idqqimg.com/wpa/images/group.png" alt="╰☆邪恶 魔方☆" title="╰☆邪恶 魔方☆"></a>
|
||||
|
||||
#QueryList简介
|
||||
***
|
||||
QueryList是一个基于phpQuery的通用列表采集类,是一个简单、 灵活、强大的采集工具,采集任何复杂的页面 基本上就一句话就能搞定了。
|
||||
|
||||
#QueryList 使用
|
||||
```php
|
||||
//获取采集对象
|
||||
$hj = QueryList::Query('http://www.baidu.com/s?wd=QueryList',array('title'=>array('h3','text'),'link'=>array('h3>a','href')));
|
||||
<p align="center">
|
||||
<img width="150" src="logo.png" alt="QueryList">
|
||||
<br>
|
||||
<br>
|
||||
</p>
|
||||
|
||||
# QueryList
|
||||
`QueryList` is a simple, elegant, extensible PHP Web Scraper (crawler/spider) ,based on phpQuery.
|
||||
|
||||
[API Documentation](https://github.com/jae-jae/QueryList/wiki)
|
||||
|
||||
[中文文档](README-ZH.md)
|
||||
|
||||
## Features
|
||||
- Have the same CSS3 DOM selector as jQuery
|
||||
- Have the same DOM manipulation API as jQuery
|
||||
- Have a generic list crawling program
|
||||
- Has a powerful HTTP request suite that makes complex network requests easy, such as simulated login, browser spoofing and HTTP proxies
|
||||
- Has a solution for garbled text (character-encoding problems)
|
||||
- Has powerful content filtering; you can use jQuery selectors to filter content
|
||||
- Has a highly modular design with strong extensibility
|
||||
- Have an expressive API
|
||||
- Has a wealth of plug-ins
|
||||
|
||||
Through plug-ins you can easily implement things like:
|
||||
- Multithreaded crawl
|
||||
- Crawl JavaScript dynamic rendering page (PhantomJS/headless WebKit)
|
||||
- Image downloads to local
|
||||
- Simulate browser behavior such as submitting Form forms
|
||||
- Web crawler
|
||||
- .....
|
||||
|
||||
## Requirements
|
||||
- PHP >= 7.1
|
||||
|
||||
## Installation
|
||||
Install with Composer:
|
||||
```
|
||||
composer require jaeger/querylist
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
#### DOM Traversal and Manipulation
|
||||
- Crawl all image links on GitHub
|
||||
|
||||
```php
|
||||
QueryList::get('https://github.com')->find('img')->attrs('src');
|
||||
```
|
||||
- Crawl Google search results
|
||||
|
||||
```php
|
||||
$ql = QueryList::get('https://www.google.co.jp/search?q=QueryList');
|
||||
|
||||
$ql->find('title')->text(); //The page title
|
||||
$ql->find('meta[name=keywords]')->content; //The page keywords
|
||||
|
||||
$ql->find('h3>a')->texts(); //Get a list of search results titles
|
||||
$ql->find('h3>a')->attrs('href'); //Get a list of search results links
|
||||
|
||||
$ql->find('img')->src; //Gets the link address of the first image
|
||||
$ql->find('img:eq(1)')->src; //Gets the link address of the second image
|
||||
$ql->find('img')->eq(2)->src; //Gets the link address of the third image
|
||||
// Loop all the images
|
||||
$ql->find('img')->map(function($img){
|
||||
echo $img->alt; //Print the alt attribute of the image
|
||||
});
|
||||
```
|
||||
- More usage
|
||||
|
||||
```php
|
||||
$ql->find('#head')->append('<div>Append content</div>')->find('div')->htmls();
|
||||
$ql->find('.two')->children('img')->attrs('alt'); // Get the class is the "two" element under all img child nodes
|
||||
// Loop class is the "two" element under all child nodes
|
||||
$data = $ql->find('.two')->children()->map(function ($item){
|
||||
// Use "is" to determine the node type
|
||||
if($item->is('a')){
|
||||
return $item->text();
|
||||
}elseif($item->is('img'))
|
||||
{
|
||||
return $item->alt;
|
||||
}
|
||||
});
|
||||
|
||||
$ql->find('a')->attr('href', 'newVal')->removeClass('className')->html('newHtml')->...
|
||||
$ql->find('div > p')->add('div > ul')->filter(':has(a)')->find('p:first')->nextAll()->andSelf()->...
|
||||
$ql->find('div.old')->replaceWith( $ql->find('div.new')->clone())->appendTo('.trash')->prepend('Deleted')->...
|
||||
```
|
||||
#### List crawl
|
||||
Crawl the title and link of the Google search results list:
|
||||
```php
|
||||
$data = QueryList::get('https://www.google.co.jp/search?q=QueryList')
|
||||
// Set the crawl rules
|
||||
->rules([
|
||||
'title'=>array('h3','text'),
|
||||
'link'=>array('h3>a','href')
|
||||
])
|
||||
->query()->getData();
|
||||
|
||||
print_r($data->all());
|
||||
```
|
||||
Results:
|
||||
```
|
||||
Array
|
||||
(
|
||||
[0] => Array
|
||||
(
|
||||
[title] => Angular - QueryList
|
||||
[link] => https://angular.io/api/core/QueryList
|
||||
)
|
||||
[1] => Array
|
||||
(
|
||||
[title] => QueryList | @angular/core - Angularリファレンス - Web Creative Park
|
||||
[link] => http://www.webcreativepark.net/angular/querylist/
|
||||
)
|
||||
[2] => Array
|
||||
(
|
||||
[title] => QueryListにQueryを追加したり、追加されたことを感知する | TIPS ...
|
||||
[link] => http://www.webcreativepark.net/angular/querylist_query_add_subscribe/
|
||||
)
|
||||
//...
|
||||
)
|
||||
```
|
||||
#### Encode convert
|
||||
```php
|
||||
// Out charset :UTF-8
|
||||
// In charset :GB2312
|
||||
QueryList::get('https://top.etao.com')->encoding('UTF-8','GB2312')->find('a')->texts();
|
||||
|
||||
// Out charset:UTF-8
|
||||
// In charset:Automatic Identification
|
||||
QueryList::get('https://top.etao.com')->encoding('UTF-8')->find('a')->texts();
|
||||
```
|
||||
|
||||
#### HTTP Client (GuzzleHttp)
|
||||
- Log in to GitHub with a cookie
|
||||
```php
|
||||
//Crawl GitHub content
|
||||
$ql = QueryList::get('https://github.com','param1=testvalue & params2=somevalue',[
|
||||
'headers' => [
|
||||
// Fill in the cookie from the browser
|
||||
'Cookie' => 'SINAGLOBAL=546064; wb_cmtLike_2112031=1; wvr=6;....'
|
||||
]
|
||||
]);
|
||||
//echo $ql->getHtml();
|
||||
$userName = $ql->find('.header-nav-current-user>.css-truncate-target')->text();
|
||||
echo $userName;
|
||||
```
|
||||
- Use the Http proxy
|
||||
```php
|
||||
$urlParams = ['param1' => 'testvalue','params2' => 'somevalue'];
|
||||
$opts = [
|
||||
// Set the http proxy
|
||||
'proxy' => 'http://222.141.11.17:8118',
|
||||
//Set the timeout time in seconds
|
||||
'timeout' => 30,
|
||||
// Fake HTTP headers
|
||||
'headers' => [
|
||||
'Referer' => 'https://querylist.cc/',
|
||||
'User-Agent' => 'testing/1.0',
|
||||
'Accept' => 'application/json',
|
||||
'X-Foo' => ['Bar', 'Baz'],
|
||||
'Cookie' => 'abc=111;xxx=222'
|
||||
]
|
||||
];
|
||||
$ql->get('http://httpbin.org/get',$urlParams,$opts);
|
||||
// echo $ql->getHtml();
|
||||
```
|
||||
|
||||
- Simulated login
|
||||
```php
|
||||
// Post login
|
||||
$ql = QueryList::post('http://xxxx.com/login',[
|
||||
'username' => 'admin',
|
||||
'password' => '123456'
|
||||
])->get('http://xxx.com/admin');
|
||||
// Crawl pages that need to be logged in to access
|
||||
$ql->get('http://xxx.com/admin/page');
|
||||
//echo $ql->getHtml();
|
||||
```
|
||||
|
||||
#### Submit forms
|
||||
Log in to GitHub
|
||||
```php
|
||||
// Get the QueryList instance
|
||||
$ql = QueryList::getInstance();
|
||||
// Get the login form
|
||||
$form = $ql->get('https://github.com/login')->find('form');
|
||||
|
||||
// Fill in the GitHub username and password
|
||||
$form->find('input[name=login]')->val('your github username or email');
|
||||
$form->find('input[name=password]')->val('your github password');
|
||||
|
||||
// Serialize the form data
|
||||
$fromData = $form->serializeArray();
|
||||
$postData = [];
|
||||
foreach ($fromData as $item) {
|
||||
$postData[$item['name']] = $item['value'];
|
||||
}
|
||||
|
||||
// Submit the login form
|
||||
$actionUrl = 'https://github.com'.$form->attr('action');
|
||||
$ql->post($actionUrl,$postData);
|
||||
// To determine whether the login is successful
|
||||
// echo $ql->getHtml();
|
||||
$userName = $ql->find('.header-nav-current-user>.css-truncate-target')->text();
|
||||
if($userName)
|
||||
{
|
||||
echo 'Login successful ! Welcome:'.$userName;
|
||||
}else{
|
||||
echo 'Login failed !';
|
||||
}
|
||||
```
|
||||
#### Bind function extension
|
||||
Extend QueryList with a custom `myHttp` method:
|
||||
```php
|
||||
$ql = QueryList::getInstance();
|
||||
|
||||
//Bind a `myHttp` method to the QueryList object
|
||||
$ql->bind('myHttp',function ($url){
|
||||
// $this is the current QueryList object
|
||||
$html = file_get_contents($url);
|
||||
$this->setHtml($html);
|
||||
return $this;
|
||||
});
|
||||
|
||||
// And then you can call by the name of the binding
|
||||
$data = $ql->myHttp('https://toutiao.io')->find('h3 a')->texts();
|
||||
print_r($data->all());
|
||||
```
|
||||
Or wrap the logic in a class, and then bind it:
|
||||
```php
|
||||
$ql->bind('myHttp',function ($url){
|
||||
return new MyHttp($this,$url);
|
||||
});
|
||||
```
|
||||
|
||||
#### Using plugins
|
||||
- Use the PhantomJS plugin to crawl JavaScript dynamically rendered pages:
|
||||
|
||||
```php
|
||||
// Set the PhantomJS binary file path during installation
|
||||
$ql = QueryList::use(PhantomJs::class,'/usr/local/bin/phantomjs');
|
||||
|
||||
// Crawl「500px」all picture links
|
||||
$data = $ql->browser('https://500px.com/editors')->find('img')->attrs('src');
|
||||
print_r($data->all());
|
||||
|
||||
// Use the HTTP proxy
|
||||
$ql->browser('https://500px.com/editors',false,[
|
||||
'--proxy' => '192.168.1.42:8080',
|
||||
'--proxy-type' => 'http'
|
||||
]);
|
||||
```
|
||||
|
||||
- Use the CURL multi-threading plugin to crawl GitHub trending with multiple threads:
|
||||
|
||||
```php
|
||||
$ql = QueryList::use(CurlMulti::class);
|
||||
$ql->curlMulti([
|
||||
'https://github.com/trending/php',
|
||||
'https://github.com/trending/go',
|
||||
//.....more urls
|
||||
])
|
||||
// Called if task is success
|
||||
->success(function (QueryList $ql,CurlMulti $curl,$r){
|
||||
echo "Current url:{$r['info']['url']} \r\n";
|
||||
$data = $ql->find('h3 a')->texts();
|
||||
print_r($data->all());
|
||||
})
|
||||
// Task fail callback
|
||||
->error(function ($errorInfo,CurlMulti $curl){
|
||||
echo "Current url:{$errorInfo['info']['url']} \r\n";
|
||||
print_r($errorInfo['error']);
|
||||
})
|
||||
->start([
|
||||
// Maximum number of threads
|
||||
'maxThread' => 10,
|
||||
// Number of error retries
|
||||
'maxTry' => 3,
|
||||
]);
|
||||
|
||||
```
|
||||
|
||||
## Plugins
|
||||
- [jae-jae/QueryList-PhantomJS](https://github.com/jae-jae/QueryList-PhantomJS) : Use PhantomJS to crawl pages rendered dynamically by JavaScript.
|
||||
- [jae-jae/QueryList-CurlMulti](https://github.com/jae-jae/QueryList-CurlMulti) : Curl multi threading.
|
||||
- [jae-jae/QueryList-AbsoluteUrl](https://github.com/jae-jae/QueryList-AbsoluteUrl) : Converting relative urls to absolute.
|
||||
- [jae-jae/QueryList-Rule-Google](https://github.com/jae-jae/QueryList-Rule-Google) : Google searcher.
|
||||
- [jae-jae/QueryList-Rule-Baidu](https://github.com/jae-jae/QueryList-Rule-Baidu) : Baidu searcher.
|
||||
|
||||
|
||||
View more QueryList plugins and QueryList-based products: [QueryList Community](https://github.com/jae-jae/QueryList-Community)
|
||||
|
||||
## Contributing
|
||||
Contributions to QueryList are welcome. For details on contributing plugins, see the [QueryList Plugin Contributing Guide](https://github.com/jae-jae/QueryList-Community/blob/master/CONTRIBUTING.md).
|
||||
|
||||
## Author
|
||||
Jaeger <JaegerCode@gmail.com>
|
||||
|
||||
If this library is useful to you, say thanks by [buying me a beer :beer:](https://www.paypal.me/jaepay)!
|
||||
|
||||
## License
|
||||
QueryList is licensed under the MIT License. See the LICENSE file for more details.
|
||||
|
@@ -1,22 +1,40 @@
|
||||
{
|
||||
"name": "jaeger/querylist",
|
||||
"description": "QueryList是基于phpQuery的无比强大的PHP采集工具",
|
||||
"description": "Simple, elegant, extensible PHP Web Scraper (crawler/spider),Use the css3 dom selector,Based on phpQuery! 简洁、优雅、可扩展的PHP采集工具(爬虫),基于phpQuery。",
|
||||
"keywords":["QueryList","phpQuery","spider"],
|
||||
"homepage": "http://querylist.cc",
|
||||
"require": {
|
||||
"PHP":">=5.3.0",
|
||||
"jaeger/phpquery-single": "^0.9.5"
|
||||
"PHP":">=7.1",
|
||||
"jaeger/phpquery-single": "^1",
|
||||
"jaeger/g-http": "^1.1",
|
||||
"ext-dom": "*",
|
||||
"tightenco/collect": ">5.0"
|
||||
},
|
||||
"suggest":{
|
||||
|
||||
},
|
||||
"license": "MIT",
|
||||
"authors": [
|
||||
{
|
||||
"name": "Jaeger",
|
||||
"email": "hj.q@qq.com"
|
||||
"email": "JaegerCode@gmail.com"
|
||||
}
|
||||
],
|
||||
"autoload":{
|
||||
"psr-4":{
|
||||
"QL\\":""
|
||||
"QL\\":"src"
|
||||
}
|
||||
},
|
||||
"autoload-dev": {
|
||||
"psr-4": {
|
||||
"Tests\\": "tests/"
|
||||
}
|
||||
},
|
||||
"require-dev": {
|
||||
"symfony/var-dumper": "^3.3",
|
||||
"phpunit/phpunit": "^8.5"
|
||||
},
|
||||
"scripts": {
|
||||
"test": "./vendor/bin/phpunit"
|
||||
}
|
||||
}
|
||||
|
19
phpunit.xml
Normal file
19
phpunit.xml
Normal file
@@ -0,0 +1,19 @@
|
||||
<phpunit
|
||||
bootstrap="vendor/autoload.php"
|
||||
convertErrorsToExceptions="true"
|
||||
convertNoticesToExceptions="true"
|
||||
convertWarningsToExceptions="true"
|
||||
>
|
||||
<testsuites>
|
||||
<testsuite name="querylist">
|
||||
<directory>./tests</directory>
|
||||
</testsuite>
|
||||
</testsuites>
|
||||
|
||||
<filter>
|
||||
<whitelist>
|
||||
<directory suffix=".php">src</directory>
|
||||
</whitelist>
|
||||
</filter>
|
||||
|
||||
</phpunit>
|
94
src/Config.php
Normal file
94
src/Config.php
Normal file
@@ -0,0 +1,94 @@
|
||||
<?php
|
||||
/**
|
||||
* Created by PhpStorm.
|
||||
* User: Jaeger <JaegerCode@gmail.com>
|
||||
* Date: 2017/9/22
|
||||
*/
|
||||
|
||||
namespace QL;
|
||||
use Closure;
|
||||
use Tightenco\Collect\Support\Collection;
|
||||
|
||||
/**
 * Global QueryList configuration.
 *
 * Collects plugins and custom method bindings and applies all of them to a
 * QueryList object when bootstrap() is called with it.
 */
class Config
{
    /**
     * Shared instance created lazily by getInstance().
     *
     * @var Config|null
     */
    protected static $instance = null;

    /**
     * Registered plugins: either plain class-name strings or
     * [class name, option list] pairs.
     *
     * @var Collection
     */
    protected $plugins;

    /**
     * Registered custom methods, keyed by method name.
     *
     * @var Collection
     */
    protected $binds;

    /**
     * Config constructor.
     */
    public function __construct()
    {
        $this->plugins = new Collection();
        $this->binds = new Collection();
    }

    /**
     * Get the shared Config instance, creating it on first use.
     *
     * @return Config
     */
    public static function getInstance()
    {
        if (!self::$instance) {
            self::$instance = new self();
        }

        return self::$instance;
    }

    /**
     * Register one plugin (with its options) or a list of plugins globally.
     *
     * @param string|array $plugins A plugin class name, or a list that is
     *                              merged into the registered plugins as-is
     * @param array ...$opt         Options stored alongside a single plugin
     * @return $this
     */
    public function use($plugins, ...$opt)
    {
        if (!is_string($plugins)) {
            $this->plugins = $this->plugins->merge($plugins);

            return $this;
        }

        $this->plugins->push([$plugins, $opt]);

        return $this;
    }

    /**
     * Register a custom method globally.
     *
     * @param string  $name     Method name to bind
     * @param Closure $provider Implementation of the method
     * @return $this
     */
    public function bind(string $name, Closure $provider)
    {
        $this->binds->offsetSet($name, $provider);

        return $this;
    }

    /**
     * Apply every registered plugin, then every registered binding, to the
     * given QueryList object.
     *
     * @param QueryList $queryList
     */
    public function bootstrap(QueryList $queryList)
    {
        $this->installPlugins($queryList);
        $this->installBind($queryList);
    }

    /**
     * Install all registered plugins on the given QueryList object.
     *
     * @param QueryList $queryList
     */
    protected function installPlugins(QueryList $queryList)
    {
        foreach ($this->plugins as $plugin) {
            // Entries merged in via use([...]) may be bare class names.
            if (is_string($plugin)) {
                $queryList->use($plugin);
                continue;
            }

            $queryList->use($plugin[0], ...$plugin[1]);
        }
    }

    /**
     * Bind all registered custom methods on the given QueryList object.
     *
     * @param QueryList $queryList
     */
    protected function installBind(QueryList $queryList)
    {
        foreach ($this->binds as $name => $provider) {
            $queryList->bind($name, $provider);
        }
    }
}
|
15
src/Contracts/PluginContract.php
Normal file
15
src/Contracts/PluginContract.php
Normal file
@@ -0,0 +1,15 @@
|
||||
<?php
|
||||
/**
|
||||
* Created by PhpStorm.
|
||||
* User: Jaeger <JaegerCode@gmail.com>
|
||||
* Date: 2017/9/22
|
||||
*/
|
||||
|
||||
namespace QL\Contracts;
|
||||
|
||||
use QL\QueryList;
|
||||
|
||||
/**
 * Contract every QueryList plugin implements.
 */
interface PluginContract
{
    /**
     * Install the plugin on the given QueryList object.
     *
     * @param QueryList $queryList Object the plugin is installed on
     * @param mixed     ...$opt    Plugin-specific options
     */
    public static function install(QueryList $queryList, ...$opt);
}
|
15
src/Contracts/ServiceProviderContract.php
Normal file
15
src/Contracts/ServiceProviderContract.php
Normal file
@@ -0,0 +1,15 @@
|
||||
<?php
|
||||
/**
|
||||
* Created by PhpStorm.
|
||||
* User: Jaeger <JaegerCode@gmail.com>
|
||||
* Date: 2017/9/20
|
||||
*/
|
||||
|
||||
namespace QL\Contracts;
|
||||
|
||||
use QL\Kernel;
|
||||
|
||||
/**
 * Contract for service providers that register services on the Kernel.
 */
interface ServiceProviderContract
{
    /**
     * Register this provider's services on the given kernel.
     *
     * @param Kernel $kernel
     */
    public function register(Kernel $kernel);
}
|
30
src/Dom/Dom.php
Normal file
30
src/Dom/Dom.php
Normal file
@@ -0,0 +1,30 @@
|
||||
<?php
|
||||
/**
|
||||
* Created by PhpStorm.
|
||||
* User: Jaeger <JaegerCode@gmail.com>
|
||||
* Date: 2017/9/19
|
||||
*/
|
||||
|
||||
namespace QL\Dom;
|
||||
|
||||
use phpQueryObject;
|
||||
|
||||
/**
 * Thin wrapper around a phpQuery document that returns Elements objects.
 */
class Dom
{
    /**
     * The wrapped phpQuery document.
     *
     * @var phpQueryObject
     */
    protected $document;

    /**
     * Dom constructor.
     *
     * @param phpQueryObject $document Document to search in
     */
    public function __construct(phpQueryObject $document)
    {
        $this->document = $document;
    }

    /**
     * Find the elements matching a selector in the wrapped document.
     *
     * @param string $selector Selector passed to the document's find()
     * @return Elements
     */
    public function find($selector)
    {
        return new Elements($this->document->find($selector));
    }
}
|
260
src/Dom/Elements.php
Normal file
260
src/Dom/Elements.php
Normal file
@@ -0,0 +1,260 @@
|
||||
<?php
|
||||
/**
|
||||
* Created by PhpStorm.
|
||||
* User: Jaeger <JaegerCode@gmail.com>
|
||||
* Date: 2017/9/19
|
||||
*/
|
||||
|
||||
namespace QL\Dom;
|
||||
|
||||
use phpDocumentor\Reflection\Types\Null_;
|
||||
use phpQueryObject;
|
||||
use Tightenco\Collect\Support\Collection;
|
||||
|
||||
/**
|
||||
* Class Elements
|
||||
* @package QL\Dom
|
||||
*
|
||||
* @method Elements toReference(&$var)
|
||||
* @method Elements documentFragment($state = null)
|
||||
* @method Elements toRoot()
|
||||
* @method Elements getDocumentIDRef(&$documentID)
|
||||
* @method Elements getDocument()
|
||||
* @method \DOMDocument getDOMDocument()
|
||||
* @method Elements getDocumentID()
|
||||
* @method Elements unloadDocument()
|
||||
* @method bool isHTML()
|
||||
* @method bool isXHTML()
|
||||
* @method bool isXML()
|
||||
* @method string serialize()
|
||||
* @method array serializeArray($submit = null)
|
||||
* @method \DOMElement|\DOMElement[] get($index = null, $callback1 = null, $callback2 = null, $callback3 = null)
|
||||
* @method string|array getString($index = null, $callback1 = null, $callback2 = null, $callback3 = null)
|
||||
* @method string|array getStrings($index = null, $callback1 = null, $callback2 = null, $callback3 = null)
|
||||
* @method Elements newInstance($newStack = null)
|
||||
* @method Elements find($selectors, $context = null, $noHistory = false)
|
||||
* @method Elements|bool is($selector, $nodes = null)
|
||||
* @method Elements filterCallback($callback, $_skipHistory = false)
|
||||
* @method Elements filter($selectors, $_skipHistory = false)
|
||||
* @method Elements load($url, $data = null, $callback = null)
|
||||
* @method Elements trigger($type, $data = [])
|
||||
* @method Elements triggerHandler($type, $data = [])
|
||||
* @method Elements bind($type, $data, $callback = null)
|
||||
* @method Elements unbind($type = null, $callback = null)
|
||||
* @method Elements change($callback = null)
|
||||
* @method Elements submit($callback = null)
|
||||
* @method Elements click($callback = null)
|
||||
* @method Elements wrapAllOld($wrapper)
|
||||
* @method Elements wrapAll($wrapper)
|
||||
* @method Elements wrapAllPHP($codeBefore, $codeAfter)
|
||||
* @method Elements wrap($wrapper)
|
||||
* @method Elements wrapPHP($codeBefore, $codeAfter)
|
||||
* @method Elements wrapInner($wrapper)
|
||||
* @method Elements wrapInnerPHP($codeBefore, $codeAfter)
|
||||
* @method Elements contents()
|
||||
* @method Elements contentsUnwrap()
|
||||
* @method Elements switchWith($markup)
|
||||
* @method Elements eq($num)
|
||||
* @method Elements size()
|
||||
* @method Elements length()
|
||||
* @method int count()
|
||||
* @method Elements end($level = 1)
|
||||
* @method Elements _clone()
|
||||
* @method Elements replaceWithPHP($code)
|
||||
* @method Elements replaceWith($content)
|
||||
* @method Elements replaceAll($selector)
|
||||
* @method Elements remove($selector = null)
|
||||
* @method Elements|string markup($markup = null, $callback1 = null, $callback2 = null, $callback3 = null)
|
||||
* @method string markupOuter($callback1 = null, $callback2 = null, $callback3 = null)
|
||||
* @method Elements|string html($html = null, $callback1 = null, $callback2 = null, $callback3 = null)
|
||||
* @method Elements|string xml($xml = null, $callback1 = null, $callback2 = null, $callback3 = null)
|
||||
* @method string htmlOuter($callback1 = null, $callback2 = null, $callback3 = null)
|
||||
* @method string xmlOuter($callback1 = null, $callback2 = null, $callback3 = null)
|
||||
* @method Elements php($code)
|
||||
* @method string markupPHP($code)
|
||||
* @method string markupOuterPHP()
|
||||
* @method Elements children($selector)
|
||||
* @method Elements ancestors($selector)
|
||||
* @method Elements append($content)
|
||||
* @method Elements appendPHP($content)
|
||||
* @method Elements appendTo($seletor)
|
||||
* @method Elements prepend($content)
|
||||
* @method Elements prependPHP($content)
|
||||
* @method Elements prependTo($seletor)
|
||||
* @method Elements before($content)
|
||||
* @method Elements beforePHP($content)
|
||||
* @method Elements insertBefore($seletor)
|
||||
* @method Elements after($content)
|
||||
* @method Elements afterPHP($content)
|
||||
* @method Elements insertAfter($seletor)
|
||||
* @method Elements insert($target, $type)
|
||||
* @method int index($subject)
|
||||
* @method Elements slice($start, $end = null)
|
||||
* @method Elements reverse()
|
||||
* @method Elements|string text($text = null, $callback1 = null, $callback2 = null, $callback3 = null)
|
||||
* @method Elements plugin($class, $file = null)
|
||||
* @method Elements _next($selector = null)
|
||||
* @method Elements _prev($selector = null)
|
||||
* @method Elements prev($selector = null)
|
||||
* @method Elements prevAll($selector = null)
|
||||
* @method Elements nextAll($selector = null)
|
||||
* @method Elements siblings($selector = null)
|
||||
* @method Elements not($selector = null)
|
||||
* @method Elements add($selector = null)
|
||||
* @method Elements parent($selector = null)
|
||||
* @method Elements parents($selector = null)
|
||||
* @method Elements stack($nodeTypes = null)
|
||||
* @method Elements|string attr($attr = null, $value = null)
|
||||
* @method Elements attrPHP($attr, $code)
|
||||
* @method Elements removeAttr($attr)
|
||||
* @method Elements|string val($val = null)
|
||||
* @method Elements andSelf()
|
||||
* @method Elements addClass($className)
|
||||
* @method Elements addClassPHP($className)
|
||||
* @method bool hasClass($className)
|
||||
* @method Elements removeClass($className)
|
||||
* @method Elements toggleClass($className)
|
||||
* @method Elements _empty()
|
||||
* @method Elements callback($callback, $param1 = null, $param2 = null, $param3 = null)
|
||||
* @method string data($key, $value = null)
|
||||
* @method Elements removeData($key)
|
||||
* @method void rewind()
|
||||
* @method Elements current()
|
||||
* @method int key()
|
||||
* @method Elements next($cssSelector = null)
|
||||
* @method bool valid()
|
||||
* @method bool offsetExists($offset)
|
||||
* @method Elements offsetGet($offset)
|
||||
* @method void offsetSet($offset, $value)
|
||||
* @method string whois($oneNode)
|
||||
* @method Elements dump()
|
||||
* @method Elements dumpWhois()
|
||||
* @method Elements dumpLength()
|
||||
* @method Elements dumpTree($html, $title)
|
||||
* @method dumpDie()
|
||||
*/
|
||||
class Elements
{
    /**
     * The wrapped phpQuery element set.
     *
     * @var phpQueryObject
     */
    protected $elements;

    /**
     * Elements constructor.
     *
     * @param phpQueryObject $elements Element set to wrap
     */
    public function __construct(phpQueryObject $elements)
    {
        $this->elements = $elements;
    }

    /**
     * Read a property of the wrapped object, falling back to the HTML
     * attribute of the same name when no such property exists.
     *
     * @param string $name
     * @return mixed
     */
    public function __get($name)
    {
        if (property_exists($this->elements, $name)) {
            return $this->elements->$name;
        }

        return $this->elements->attr($name);
    }

    /**
     * Forward unknown method calls to the wrapped phpQuery object.
     *
     * phpQueryObject results are re-wrapped in Elements and string results
     * are trimmed; anything else is returned untouched.
     *
     * @param string $name
     * @param array  $arguments
     * @return mixed
     */
    public function __call($name, $arguments)
    {
        $result = call_user_func_array([$this->elements, $name], $arguments);

        if ($result instanceof phpQueryObject) {
            return new self($result);
        }

        return is_string($result) ? trim($result) : $result;
    }

    /**
     * Iterate over the elements.
     *
     * The callback receives each element wrapped in Elements plus its key;
     * returning false from the callback stops the iteration.
     *
     * @param callable $callback
     *
     * @return $this
     */
    public function each(callable $callback)
    {
        foreach ($this->elements as $key => $element) {
            if ($callback(new self(pq($element)), $key) === false) {
                break;
            }
        }

        return $this;
    }

    /**
     * Map every element through a callback and collect the results.
     *
     * @param callable $callback Receives each element wrapped in Elements
     * @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection
     */
    public function map($callback)
    {
        $results = new Collection();

        $this->elements->each(function ($node) use ($results, $callback) {
            $results->push($callback(new self(pq($node))));
        });

        return $results;
    }

    /**
     * Gets the given attribute of all the elements
     *
     * @param string $attr HTML attribute name
     * @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection
     */
    public function attrs($attr)
    {
        $readAttribute = function ($element) use ($attr) {
            return $element->attr($attr);
        };

        return $this->map($readAttribute);
    }

    /**
     * Gets the trimmed text of all the elements
     *
     * @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection
     */
    public function texts()
    {
        $readText = function ($element) {
            return trim($element->text());
        };

        return $this->map($readText);
    }

    /**
     * Gets the trimmed inner html of all the elements
     *
     * @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection
     */
    public function htmls()
    {
        $readHtml = function ($element) {
            return trim($element->html());
        };

        return $this->map($readHtml);
    }

    /**
     * Gets the trimmed htmlOuter of all the elements
     *
     * @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection
     */
    public function htmlOuters()
    {
        $readOuterHtml = function ($element) {
            return trim($element->htmlOuter());
        };

        return $this->map($readOuterHtml);
    }

    /**
     * Get the wrapped phpQuery object.
     *
     * @return phpQueryObject
     */
    public function getElements(): phpQueryObject
    {
        return $this->elements;
    }
}
|
322
src/Dom/Query.php
Normal file
322
src/Dom/Query.php
Normal file
@@ -0,0 +1,322 @@
|
||||
<?php
|
||||
/**
|
||||
* Created by PhpStorm.
|
||||
* User: Jaeger <JaegerCode@gmail.com>
|
||||
* Date: 2017/9/21
|
||||
*/
|
||||
|
||||
namespace QL\Dom;
|
||||
|
||||
use Tightenco\Collect\Support\Collection;
|
||||
use phpQuery;
|
||||
use phpQueryObject;
|
||||
use QL\QueryList;
|
||||
use Closure;
|
||||
|
||||
/**
 * Holds the HTML document of a QueryList object and extracts data from it
 * according to the configured rules and (optional) range selector.
 */
class Query
{
    /**
     * HTML as stored by setHtml().
     */
    protected $html;

    /**
     * Document parsed from $html.
     *
     * @var \phpQueryObject
     */
    protected $document;

    /**
     * Crawl rules set through rules(); null until then.
     */
    protected $rules;

    /**
     * Range selector set through range(); null when no range is used.
     */
    protected $range = null;

    /**
     * The QueryList object returned by the fluent setters.
     */
    protected $ql;

    /**
     * Result of the last query() / setData() call.
     *
     * @var Collection
     */
    protected $data;

    public function __construct(QueryList $ql)
    {
        $this->ql = $ql;
    }

    /**
     * Get the current HTML.
     *
     * @param bool $rel When true, return the outer HTML of the parsed document;
     *                  otherwise return the HTML stored by setHtml()
     * @return String
     */
    public function getHtml($rel = true)
    {
        return $rel ? $this->document->htmlOuter() : $this->html;
    }

    /**
     * Set the HTML to query and (re)build the document from it.
     *
     * Any previously loaded document is unloaded first.
     *
     * @param $html
     * @param null $charset Charset forwarded to phpQuery::newDocumentHTML()
     * @return QueryList
     */
    public function setHtml($html, $charset = null)
    {
        // value() is a helper defined outside this file; presumably it
        // resolves closures to their return value - TODO confirm.
        $this->html = value($html);
        $this->destroyDocument();
        $this->document = phpQuery::newDocumentHTML($this->html, $charset);

        return $this->ql;
    }

    /**
     * Get crawl results
     *
     * @param Closure|null $callback Optional post-processing, see handleData()
     * @return Collection|static
     */
    public function getData(?Closure $callback = null)
    {
        return $this->handleData($this->data, $callback);
    }

    /**
     * Replace the stored crawl results.
     *
     * @param Collection $data
     */
    public function setData(Collection $data)
    {
        $this->data = $data;
    }

    /**
     * Searches for all elements that match the specified expression.
     *
     * @param $selector A string containing a selector expression to match elements against.
     * @return Elements
     */
    public function find($selector)
    {
        return (new Dom($this->document))->find($selector);
    }

    /**
     * Set crawl rule
     *
     * $rules = [
     *    'rule_name1' => ['selector','HTML attribute | text | html','Tag filter list','callback'],
     *    'rule_name2' => ['selector','HTML attribute | text | html','Tag filter list','callback'],
     *    // ...
     *  ]
     *
     * @param array $rules
     * @return QueryList
     */
    public function rules(array $rules)
    {
        $this->rules = $rules;

        return $this->ql;
    }

    /**
     * Set the slice area for crawl list
     *
     * @param $selector
     * @return QueryList
     */
    public function range($selector)
    {
        $this->range = $selector;

        return $this->ql;
    }

    /**
     * Remove HTML head,try to solve the garbled
     *
     * The head element of the stored HTML is replaced by an empty
     * `<head></head>`; the document is reloaded only when the result is
     * non-empty.
     *
     * @return QueryList
     */
    public function removeHead()
    {
        $html = preg_replace('/(<head>|<head\s+.+?>).+?<\/head>/is', '<head></head>', $this->html);
        $html && $this->setHtml($html);

        return $this->ql;
    }

    /**
     * Execute the query rule
     *
     * @param Closure|null $callback Optional post-processing, see handleData()
     * @return QueryList
     */
    public function query(?Closure $callback = null)
    {
        $this->data = $this->getList();
        $this->data = $this->handleData($this->data, $callback);

        return $this->ql;
    }

    /**
     * Post-process crawl results with an optional callback.
     *
     * Without a range the callback is called once with the whole result array
     * (and null as second argument) and its return value is wrapped in a new
     * Collection. With a range the callback is mapped over every row.
     *
     * @param Collection $data
     * @param callable|null $callback
     * @return Collection
     */
    public function handleData(Collection $data, $callback)
    {
        if (is_callable($callback)) {
            if (empty($this->range)) {
                $data = new Collection($callback($data->all(), null));
            } else {
                $data = $data->map($callback);
            }
        }

        return $data;
    }

    /**
     * Apply the rules to the document and collect the extracted values.
     *
     * Without a range the result is keyed by rule name; with a range there is
     * one row (keyed by rule name) per element matched by the range selector.
     *
     * @return Collection
     */
    protected function getList()
    {
        $data = [];
        // rules() may not have been called yet; treat that as "no rules".
        $rules = $this->rules ?? [];

        if (empty($this->range)) {
            foreach ($rules as $key => $reg_value) {
                $rule = $this->parseRule($reg_value);
                $contentElements = $this->document->find($rule['selector']);
                $data[$key] = $this->extractContent($contentElements, $key, $rule);
            }
        } else {
            $rangeElements = $this->document->find($this->range);
            $i = 0;
            foreach ($rangeElements as $element) {
                foreach ($rules as $key => $reg_value) {
                    $rule = $this->parseRule($reg_value);
                    $contentElements = pq($element)->find($rule['selector']);
                    $data[$i][$key] = $this->extractContent($contentElements, $key, $rule);
                }
                $i++;
            }
        }

        return new Collection($data);
    }

    /**
     * Extract the content described by one parsed rule from matched elements.
     *
     * Supported 'attr' values: text, texts, html, htmls, htmlOuter,
     * htmlOuters, attr(name), attrs(name); anything else is read as an HTML
     * attribute name. The plural forms return an array with one entry per
     * element. The rule's handle_callback, when callable, receives the content
     * and the rule name and its return value replaces the content.
     *
     * @param phpQueryObject $pqObj Elements matched by the rule's selector
     * @param string $ruleName
     * @param array $rule Rule as returned by parseRule()
     * @return mixed
     */
    protected function extractContent(phpQueryObject $pqObj, $ruleName, $rule)
    {
        switch ($rule['attr']) {
            case 'text':
                $content = $this->allowTags($pqObj->html(), $rule['filter_tags']);
                break;
            case 'texts':
                $content = (new Elements($pqObj))->map(function (Elements $element) use ($rule) {
                    return $this->allowTags($element->html(), $rule['filter_tags']);
                })->all();
                break;
            case 'html':
                $content = $this->stripTags($pqObj->html(), $rule['filter_tags']);
                break;
            case 'htmls':
                $content = (new Elements($pqObj))->map(function (Elements $element) use ($rule) {
                    return $this->stripTags($element->html(), $rule['filter_tags']);
                })->all();
                break;
            case 'htmlOuter':
                $content = $this->stripTags($pqObj->htmlOuter(), $rule['filter_tags']);
                break;
            case 'htmlOuters':
                $content = (new Elements($pqObj))->map(function (Elements $element) use ($rule) {
                    return $this->stripTags($element->htmlOuter(), $rule['filter_tags']);
                })->all();
                break;
            default:
                if (preg_match('/attr\((.+)\)/', $rule['attr'], $arr)) {
                    $content = $pqObj->attr($arr[1]);
                } elseif (preg_match('/attrs\((.+)\)/', $rule['attr'], $arr)) {
                    $content = (new Elements($pqObj))->attrs($arr[1])->all();
                } else {
                    $content = $pqObj->attr($rule['attr']);
                }
                break;
        }

        if (is_callable($rule['handle_callback'])) {
            $content = call_user_func($rule['handle_callback'], $content, $ruleName);
        }

        return $content;
    }

    /**
     * Turn a positional rule definition into a named one.
     *
     * @param array $rule [selector, attr, filter tags (optional), callback (optional)]
     * @return array
     */
    protected function parseRule($rule)
    {
        $result = [];
        $result['selector'] = $rule[0];
        $result['attr'] = $rule[1];
        $result['filter_tags'] = $rule[2] ?? '';
        $result['handle_callback'] = $rule[3] ?? null;

        return $result;
    }

    /**
     * Strip specific HTML tags.
     *
     * Plain tag names have their opening/closing markup removed while the
     * content between them is kept; names prefixed with `-` are passed to
     * removeTags() first.
     *
     * @param  string $html
     * @param  string $tags_str Tag names separated by whitespace
     * @return string
     */
    protected function stripTags($html, $tags_str)
    {
        $tagsArr = $this->tag($tags_str);
        $html = $this->removeTags($html, $tagsArr[1]);
        $p = array();
        foreach ($tagsArr[0] as $tag) {
            // The lookahead requires the tag name to end here, so that "a"
            // does not also match <abbr>/<article>, or "b" match <br>/<body>.
            $p[] = "/<\/?" . $tag . "(?=[\s\/>])[^>]*>/i";
        }
        $html = preg_replace($p, "", trim($html));

        return $html;
    }

    /**
     * Keep only specific HTML tags.
     *
     * All markup except the plain tag names listed is stripped with
     * strip_tags(); names prefixed with `-` are passed to removeTags() first.
     *
     * @param  string $html
     * @param  string $tags_str Tag names separated by whitespace
     * @return string
     */
    protected function allowTags($html, $tags_str)
    {
        $tagsArr = $this->tag($tags_str);
        $html = $this->removeTags($html, $tagsArr[1]);
        $allow = '';
        foreach ($tagsArr[0] as $tag) {
            $allow .= "<$tag> ";
        }

        return strip_tags(trim($html), $allow);
    }

    /**
     * Split a tag filter string into plain names and `-`-prefixed names.
     *
     * @param string $tags_str Entries separated by whitespace
     * @return array [plain entries, entries that had a leading `-` (dash removed)]
     */
    protected function tag($tags_str)
    {
        $tagArr = preg_split("/\s+/", $tags_str, -1, PREG_SPLIT_NO_EMPTY);
        $tags = array(array(), array());
        foreach ($tagArr as $tag) {
            // Anchored: only a LEADING dash marks a removal entry, so a
            // hyphenated name such as "my-widget" stays a plain tag name.
            if (preg_match('/^-(.+)/', $tag, $arr)) {
                array_push($tags[1], $arr[1]);
            } else {
                array_push($tags[0], $tag);
            }
        }

        return $tags;
    }

    /**
     * Remove specific HTML elements.
     *
     * The entries are joined with commas into one selector; everything it
     * matches is removed from a temporary document built from $html. When
     * $tags is empty, $html is returned unchanged.
     *
     * @param  string $html
     * @param  array  $tags Array of tags
     * @return string
     */
    protected function removeTags($html, $tags)
    {
        if (count($tags)) {
            $tag_str = implode(',', $tags);
            $doc = phpQuery::newDocumentHTML($html);
            pq($doc)->find($tag_str)->remove();
            $html = pq($doc)->htmlOuter();
            $doc->unloadDocument();
        }

        return $html;
    }

    /**
     * Unload the current phpQuery document, if one is loaded.
     */
    protected function destroyDocument()
    {
        if ($this->document instanceof phpQueryObject) {
            $this->document->unloadDocument();
        }
    }

    public function __destruct()
    {
        $this->destroyDocument();
    }
}
|
15
src/Exceptions/ServiceNotFoundException.php
Normal file
15
src/Exceptions/ServiceNotFoundException.php
Normal file
@@ -0,0 +1,15 @@
|
||||
<?php
|
||||
/**
|
||||
* Created by PhpStorm.
|
||||
* User: Jaeger <JaegerCode@gmail.com>
|
||||
* Date: 2017/9/21
|
||||
*/
|
||||
|
||||
namespace QL\Exceptions;
|
||||
|
||||
use Exception;
|
||||
|
||||
/**
 * Exception type for a service that cannot be found.
 */
class ServiceNotFoundException extends Exception
{
}
|
74
src/Kernel.php
Normal file
74
src/Kernel.php
Normal file
@@ -0,0 +1,74 @@
|
||||
<?php
|
||||
/**
|
||||
* Created by PhpStorm.
|
||||
* User: Jaeger <JaegerCode@gmail.com>
|
||||
* Date: 2017/9/21
|
||||
*/
|
||||
|
||||
namespace QL;
|
||||
|
||||
use QL\Contracts\ServiceProviderContract;
|
||||
use QL\Exceptions\ServiceNotFoundException;
|
||||
use QL\Providers\EncodeServiceProvider;
|
||||
use Closure;
|
||||
use QL\Providers\HttpServiceProvider;
|
||||
use QL\Providers\PluginServiceProvider;
|
||||
use QL\Providers\SystemServiceProvider;
|
||||
use Tightenco\Collect\Support\Collection;
|
||||
|
||||
/**
 * Service registry of a QueryList object: runs the built-in service
 * providers and stores the closures they bind by name.
 */
class Kernel
{
    /**
     * Service provider classes instantiated by registerProviders().
     *
     * @var string[]
     */
    protected $providers = [
        SystemServiceProvider::class,
        HttpServiceProvider::class,
        EncodeServiceProvider::class,
        PluginServiceProvider::class
    ];

    /**
     * Bound services, keyed by name.
     *
     * @var Collection
     */
    protected $binds;

    /**
     * The QueryList object this kernel was created for.
     *
     * @var QueryList
     */
    protected $ql;

    /**
     * Kernel constructor.
     *
     * @param QueryList $ql
     */
    public function __construct(QueryList $ql)
    {
        $this->binds = new Collection();
        $this->ql = $ql;
    }

    /**
     * Register the service providers.
     *
     * @return $this
     */
    public function bootstrap()
    {
        $this->registerProviders();

        return $this;
    }

    /**
     * Instantiate every configured provider and let it register itself.
     */
    public function registerProviders()
    {
        foreach ($this->providers as $providerClass) {
            $this->register(new $providerClass());
        }
    }

    /**
     * Bind a service closure under the given name.
     *
     * @param string  $name
     * @param Closure $provider
     */
    public function bind(string $name, Closure $provider)
    {
        $this->binds->offsetSet($name, $provider);
    }

    /**
     * Get the closure bound under the given name.
     *
     * @param string $name
     * @return Closure
     * @throws ServiceNotFoundException When nothing is bound under $name
     */
    public function getService(string $name)
    {
        if ($this->binds->offsetExists($name)) {
            return $this->binds[$name];
        }

        throw new ServiceNotFoundException("Service: {$name} not found!");
    }

    /**
     * Let a single provider register its services on this kernel.
     *
     * @param ServiceProviderContract $instance
     */
    private function register(ServiceProviderContract $instance)
    {
        $instance->register($this);
    }
}
|
22
src/Providers/EncodeServiceProvider.php
Normal file
22
src/Providers/EncodeServiceProvider.php
Normal file
@@ -0,0 +1,22 @@
|
||||
<?php
|
||||
/**
|
||||
* Created by PhpStorm.
|
||||
* User: Jaeger <JaegerCode@gmail.com>
|
||||
* Date: 2017/9/20
|
||||
*/
|
||||
|
||||
namespace QL\Providers;
|
||||
|
||||
use QL\Contracts\ServiceProviderContract;
|
||||
use QL\Kernel;
|
||||
use QL\Services\EncodeService;
|
||||
|
||||
/**
 * Registers the "encoding" service on the kernel.
 */
class EncodeServiceProvider implements ServiceProviderContract
{
    /**
     * Bind the `encoding` closure, which delegates to EncodeService::convert().
     *
     * @param Kernel $kernel Kernel receiving the binding
     */
    public function register(Kernel $kernel)
    {
        // Deliberately not a static closure: QueryList::__call() invokes bound
        // services through Closure::call(), so `$this` below is the calling
        // QueryList instance rather than this provider.
        $convertEncoding = function (string $outputEncoding,string $inputEncoding = null) {
            return EncodeService::convert($this, $outputEncoding, $inputEncoding);
        };

        $kernel->bind('encoding', $convertEncoding);
    }
}
|
40
src/Providers/HttpServiceProvider.php
Normal file
40
src/Providers/HttpServiceProvider.php
Normal file
@@ -0,0 +1,40 @@
|
||||
<?php
|
||||
/**
|
||||
* Created by PhpStorm.
|
||||
* User: Jaeger <JaegerCode@gmail.com>
|
||||
* Date: 2017/9/22
|
||||
*/
|
||||
|
||||
namespace QL\Providers;
|
||||
|
||||
|
||||
use QL\Contracts\ServiceProviderContract;
|
||||
use QL\Kernel;
|
||||
use QL\Services\HttpService;
|
||||
use QL\Services\MultiRequestService;
|
||||
|
||||
/**
 * Registers the HTTP services: `get`, `post`, `postJson`, `multiGet`
 * and `multiPost`.
 */
class HttpServiceProvider implements ServiceProviderContract
{
    /**
     * Bind the HTTP closures on the kernel.
     *
     * The closures are not static on purpose: QueryList::__call() invokes
     * them through Closure::call(), so `$this` inside each closure is the
     * calling QueryList instance.
     *
     * @param Kernel $kernel Kernel receiving the bindings
     */
    public function register(Kernel $kernel)
    {
        // Single-request services: each forwards to the HttpService static
        // method that has the same name as the service.
        foreach (['get', 'post', 'postJson'] as $verb) {
            $kernel->bind($verb, function (...$args) use ($verb) {
                return HttpService::$verb($this, ...$args);
            });
        }

        // Multi-request services: each returns a new MultiRequestService
        // configured with the HTTP method it will use.
        $multiServices = [
            'multiGet'  => 'get',
            'multiPost' => 'post',
        ];
        foreach ($multiServices as $serviceName => $httpMethod) {
            $kernel->bind($serviceName, function (...$args) use ($httpMethod) {
                return new MultiRequestService($this, $httpMethod, ...$args);
            });
        }
    }
}
|
23
src/Providers/PluginServiceProvider.php
Normal file
23
src/Providers/PluginServiceProvider.php
Normal file
@@ -0,0 +1,23 @@
|
||||
<?php
|
||||
/**
|
||||
* Created by PhpStorm.
|
||||
* User: Jaeger <JaegerCode@gmail.com>
|
||||
* Date: 2017/9/22
|
||||
*/
|
||||
|
||||
namespace QL\Providers;
|
||||
|
||||
use QL\Contracts\ServiceProviderContract;
|
||||
use QL\Kernel;
|
||||
use QL\Services\PluginService;
|
||||
|
||||
/**
 * Registers the "use" service, which installs plugins.
 */
class PluginServiceProvider implements ServiceProviderContract
{
    /**
     * Bind the `use` closure, which delegates to PluginService::install().
     *
     * @param Kernel $kernel Kernel receiving the binding
     */
    public function register(Kernel $kernel)
    {
        // Not static: QueryList::__call() runs the closure via Closure::call(),
        // which makes `$this` the calling QueryList instance.
        $installPlugins = function ($plugins, ...$opt) {
            return PluginService::install($this, $plugins, ...$opt);
        };

        $kernel->bind('use', $installPlugins);
    }
}
|
32
src/Providers/SystemServiceProvider.php
Normal file
32
src/Providers/SystemServiceProvider.php
Normal file
@@ -0,0 +1,32 @@
|
||||
<?php
|
||||
/**
|
||||
* Created by PhpStorm.
|
||||
* User: Jaeger <JaegerCode@gmail.com>
|
||||
* Date: 2017/9/22
|
||||
*/
|
||||
|
||||
namespace QL\Providers;
|
||||
|
||||
use QL\Contracts\ServiceProviderContract;
|
||||
use QL\Kernel;
|
||||
use Closure;
|
||||
|
||||
/**
 * Registers the core services: `html`, `queryData` and `pipe`.
 */
class SystemServiceProvider implements ServiceProviderContract
{
    /**
     * Bind the core closures on the kernel.
     *
     * The closures are not static on purpose: QueryList::__call() invokes
     * them through Closure::call(), so `$this` inside each closure is the
     * calling QueryList instance.
     *
     * @param Kernel $kernel Kernel receiving the bindings
     */
    public function register(Kernel $kernel)
    {
        // html(...): forwards its arguments to setHtml() and returns the
        // QueryList so calls can be chained.
        $kernel->bind('html', function (...$args) {
            $this->setHtml(...$args);

            return $this;
        });

        // queryData($callback): shorthand for query()->getData($callback)->all().
        $kernel->bind('queryData', function (Closure $callback = null) {
            return $this->query()->getData($callback)->all();
        });

        // pipe($callback): hands the QueryList to the callback and returns
        // whatever the callback returns.
        $kernel->bind('pipe', function (Closure $callback = null) {
            // The callback is optional in the signature, but invoking a null
            // callback is a fatal error. Without a callback there is nothing
            // to run, so return the QueryList unchanged.
            if ($callback === null) {
                return $this;
            }

            return $callback($this);
        });
    }
}
|
133
src/QueryList.php
Normal file
133
src/QueryList.php
Normal file
@@ -0,0 +1,133 @@
|
||||
<?php
|
||||
/**
|
||||
* QueryList
|
||||
*
|
||||
* 一个基于phpQuery的通用列表采集类
|
||||
*
|
||||
* @author Jaeger
|
||||
* @email JaegerCode@gmail.com
|
||||
* @link https://github.com/jae-jae/QueryList
|
||||
* @version 4.0.0
|
||||
*
|
||||
*/
|
||||
|
||||
namespace QL;
|
||||
use phpQuery;
|
||||
use QL\Dom\Query;
|
||||
use Tightenco\Collect\Support\Collection;
|
||||
use Closure;
|
||||
use QL\Services\MultiRequestService;
|
||||
|
||||
|
||||
/**
 * Class QueryList
 * @package QL
 *
 * @method string getHtml($rel = true)
 * @method QueryList setHtml($html)
 * @method QueryList html($html)
 * @method Dom\Elements find($selector)
 * @method QueryList rules(array $rules)
 * @method QueryList range($range)
 * @method QueryList removeHead()
 * @method QueryList query(Closure $callback = null)
 * @method Collection getData(Closure $callback = null)
 * @method array queryData(Closure $callback = null)
 * @method QueryList setData(Collection $data)
 * @method QueryList encoding(string $outputEncoding,string $inputEncoding = null)
 * @method QueryList get($url,$args = null,$otherArgs = [])
 * @method QueryList post($url,$args = null,$otherArgs = [])
 * @method QueryList postJson($url,$args = null,$otherArgs = [])
 * @method MultiRequestService multiGet($urls)
 * @method MultiRequestService multiPost($urls)
 * @method QueryList use($plugins,...$opt)
 * @method QueryList pipe(Closure $callback = null)
 */
class QueryList
{
    /**
     * DOM query object; methods that exist on it are served first by __call().
     *
     * @var Query
     */
    protected $query;

    /**
     * Kernel holding the bound services that back the remaining magic methods.
     *
     * @var Kernel
     */
    protected $kernel;

    /**
     * Shared instance returned by getInstance(); created on first use.
     *
     * @var QueryList|null
     */
    protected static $instance = null;

    /**
     * QueryList constructor.
     *
     * Creates the Query object, boots a Kernel for this instance, then lets
     * the global Config bootstrap this instance.
     */
    public function __construct()
    {
        $this->query = new Query($this);
        $this->kernel = (new Kernel($this))->bootstrap();
        Config::getInstance()->bootstrap($this);
    }

    /**
     * Dispatch an undefined instance method.
     *
     * A method that exists on the Query object wins. Anything else is looked
     * up as a kernel service and invoked with this instance bound as `$this`.
     *
     * @param string $name      Method name
     * @param array  $arguments Call arguments
     * @return mixed Whatever the Query method or the service returns
     * @throws \QL\Exceptions\ServiceNotFoundException When neither a Query
     *         method nor a bound service exists under $name
     */
    public function __call($name, $arguments)
    {
        if (method_exists($this->query, $name)) {
            return $this->query->$name(...$arguments);
        }

        $service = $this->kernel->getService($name);

        return $service->call($this, ...$arguments);
    }

    /**
     * Dispatch an undefined static method by calling the same method on a
     * brand-new instance (a new object is created on every static call).
     *
     * @param string $name      Method name
     * @param array  $arguments Call arguments
     * @return mixed
     */
    public static function __callStatic($name, $arguments)
    {
        $freshInstance = new self();

        return $freshInstance->$name(...$arguments);
    }

    /**
     * Release the held objects when the instance is destroyed.
     */
    public function __destruct()
    {
        $this->destruct();
    }

    /**
     * Get the QueryList single instance
     *
     * @return QueryList
     */
    public static function getInstance()
    {
        if (!self::$instance) {
            self::$instance = new self();
        }

        return self::$instance;
    }

    /**
     * Get the Config instance
     * @return null|Config
     */
    public static function config()
    {
        return Config::getInstance();
    }

    /**
     * Destruction of resources
     *
     * Unsets the Query and Kernel properties of this instance.
     */
    public function destruct()
    {
        unset($this->query, $this->kernel);
    }

    /**
     * Destroy all documents
     *
     * Resets phpQuery's static document list to an empty array.
     */
    public static function destructDocuments()
    {
        phpQuery::$documents = [];
    }

    /**
     * Bind a custom method to the QueryList object
     *
     * @param string $name Invoking the name
     * @param Closure $provide Called method
     * @return $this
     */
    public function bind(string $name,Closure $provide)
    {
        $this->kernel->bind($name, $provide);

        return $this;
    }
}
|
37
src/Services/EncodeService.php
Normal file
37
src/Services/EncodeService.php
Normal file
@@ -0,0 +1,37 @@
|
||||
<?php
|
||||
/**
|
||||
* Created by PhpStorm.
|
||||
* User: Jaeger <JaegerCode@gmail.com>
|
||||
* Date: 2017/9/20
|
||||
* 编码转换服务
|
||||
*/
|
||||
|
||||
namespace QL\Services;
|
||||
|
||||
use QL\QueryList;
|
||||
|
||||
/**
 * Character-encoding conversion service.
 */
class EncodeService
{
    /**
     * Convert the HTML held by a QueryList instance to another encoding.
     *
     * When $inputEncoding is not given it is guessed with detect(). If the
     * source encoding cannot be determined, or iconv() reports a failure,
     * the HTML is left untouched instead of being overwritten with `false`.
     *
     * @param QueryList   $ql             Instance whose HTML is converted
     * @param string      $outputEncoding Target encoding
     * @param string|null $inputEncoding  Source encoding; detected when empty
     * @return QueryList The same instance, for chaining
     */
    public static function convert(QueryList $ql,string $outputEncoding,string $inputEncoding = null)
    {
        $html = $ql->getHtml();

        if (!$inputEncoding) {
            $inputEncoding = self::detect($html);
        }

        // detect() returns false when none of its candidate encodings match;
        // false is not a usable source encoding for iconv().
        if (!$inputEncoding) {
            return $ql;
        }

        // "//IGNORE" asks iconv to drop characters that cannot be represented
        // in the target encoding.
        $converted = iconv($inputEncoding, $outputEncoding.'//IGNORE', $html);

        // iconv() returns false on failure; keep the original document then.
        if ($converted !== false) {
            $ql->setHtml($converted);
        }

        return $ql;
    }

    /**
     * Attempts to detect the encoding
     *
     * Candidates are tried in strict mode, in the order ASCII, GB2312, GBK,
     * UTF-8. A result of "CP936" (any letter case) is reported as "GBK".
     *
     * @param string $string Text to inspect
     * @return string|false Detected encoding name, or false when none matched
     */
    public static function detect($string)
    {
        $charset = mb_detect_encoding($string, ['ASCII', 'GB2312', 'GBK', 'UTF-8'], true);

        if (is_string($charset) && strtolower($charset) === 'cp936') {
            $charset = 'GBK';
        }

        return $charset;
    }
}
|
59
src/Services/HttpService.php
Normal file
59
src/Services/HttpService.php
Normal file
@@ -0,0 +1,59 @@
|
||||
<?php
|
||||
/**
|
||||
* Created by PhpStorm.
|
||||
* User: Jaeger <JaegerCode@gmail.com>
|
||||
* Date: 2017/9/22
|
||||
*/
|
||||
|
||||
namespace QL\Services;
|
||||
|
||||
use GuzzleHttp\Cookie\CookieJar;
|
||||
use Jaeger\GHttp;
|
||||
use QL\QueryList;
|
||||
|
||||
/**
 * Single-request HTTP helpers: each one performs a request through GHttp and
 * loads the returned body into the given QueryList instance.
 */
class HttpService
{
    /**
     * Cookie jar shared by every request made through this class;
     * created on first use.
     *
     * @var CookieJar|null
     */
    protected static $cookieJar = null;

    /**
     * Return the shared cookie jar, creating it on the first call.
     *
     * @return CookieJar
     */
    public static function getCookieJar()
    {
        if (null === self::$cookieJar) {
            self::$cookieJar = new CookieJar();
        }

        return self::$cookieJar;
    }

    /**
     * Perform a GET request and load the response into $ql.
     *
     * @param QueryList  $ql        Instance that receives the response HTML
     * @param string     $url       Request URL
     * @param mixed|null $args      Passed through to GHttp::get()
     * @param array      $otherArgs Extra options; override the defaults
     * @return QueryList The same instance, for chaining
     */
    public static function get(QueryList $ql,$url,$args = null,$otherArgs = [])
    {
        $ql->setHtml(GHttp::get($url, $args, self::withDefaultOptions($otherArgs)));

        return $ql;
    }

    /**
     * Perform a POST request and load the response into $ql.
     *
     * @param QueryList  $ql        Instance that receives the response HTML
     * @param string     $url       Request URL
     * @param mixed|null $args      Passed through to GHttp::post()
     * @param array      $otherArgs Extra options; override the defaults
     * @return QueryList The same instance, for chaining
     */
    public static function post(QueryList $ql,$url,$args = null,$otherArgs = [])
    {
        $ql->setHtml(GHttp::post($url, $args, self::withDefaultOptions($otherArgs)));

        return $ql;
    }

    /**
     * Perform a JSON POST request and load the response into $ql.
     *
     * @param QueryList  $ql        Instance that receives the response HTML
     * @param string     $url       Request URL
     * @param mixed|null $args      Passed through to GHttp::postJson()
     * @param array      $otherArgs Extra options; override the defaults
     * @return QueryList The same instance, for chaining
     */
    public static function postJson(QueryList $ql,$url,$args = null,$otherArgs = [])
    {
        $ql->setHtml(GHttp::postJson($url, $args, self::withDefaultOptions($otherArgs)));

        return $ql;
    }

    /**
     * Merge caller options over the defaults used by every request:
     * the shared cookie jar and `verify => false`.
     *
     * NOTE(review): these options are presumably forwarded to Guzzle, where
     * `verify => false` disables TLS certificate verification - confirm that
     * this default is intended.
     *
     * @param array $otherArgs Caller-supplied options (win on key conflicts)
     * @return array
     */
    private static function withDefaultOptions($otherArgs)
    {
        $defaults = [
            'cookies' => self::getCookieJar(),
            'verify' => false,
        ];

        return array_merge($defaults, $otherArgs);
    }
}
|
66
src/Services/MultiRequestService.php
Normal file
66
src/Services/MultiRequestService.php
Normal file
@@ -0,0 +1,66 @@
|
||||
<?php
|
||||
/**
|
||||
* Created by PhpStorm.
|
||||
* User: Jaeger <JaegerCode@gmail.com>
|
||||
* Date: 18/12/10
|
||||
* Time: 下午7:05
|
||||
*/
|
||||
|
||||
namespace QL\Services;
|
||||
|
||||
|
||||
use Jaeger\GHttp;
|
||||
use Closure;
|
||||
use GuzzleHttp\Psr7\Response;
|
||||
use QL\QueryList;
|
||||
use GuzzleHttp\Exception\RequestException;
|
||||
|
||||
/**
 * Class MultiRequestService
 * @package QL\Services
 *
 * Fluent wrapper around the object returned by GHttp::multiRequest(). It
 * passes the owning QueryList instance to the success/error callbacks.
 *
 * @method MultiRequestService withHeaders($headers)
 * @method MultiRequestService withOptions($options)
 * @method MultiRequestService concurrency($concurrency)
 */
class MultiRequestService
{
    /**
     * QueryList instance handed to the user callbacks.
     *
     * @var QueryList
     */
    protected $ql;

    /**
     * Underlying multi-request object created by GHttp::multiRequest().
     */
    protected $multiRequest;

    /**
     * Name of the method invoked on the multi-request object by send()
     * (HttpServiceProvider passes 'get' or 'post').
     *
     * @var string
     */
    protected $method;

    /**
     * @param QueryList $ql     Instance passed to the success/error callbacks
     * @param string    $method Method called on the multi-request object by send()
     * @param mixed     $urls   Passed through to GHttp::multiRequest()
     */
    public function __construct(QueryList $ql,$method,$urls)
    {
        $this->ql = $ql;
        $this->method = $method;
        $this->multiRequest = GHttp::multiRequest($urls);
    }

    /**
     * Forward any other method (e.g. withHeaders, withOptions, concurrency)
     * to the underlying multi-request object and keep whatever it returns
     * as the new underlying object.
     *
     * @param string $name      Method name
     * @param array  $arguments Call arguments
     * @return $this
     */
    public function __call($name, $arguments)
    {
        $this->multiRequest = $this->multiRequest->$name(...$arguments);

        return $this;
    }

    /**
     * Register the success callback.
     *
     * Before the callback runs, the response body is loaded into the
     * QueryList instance. The callback receives ($ql, $response, $index).
     *
     * @param Closure $success
     * @return $this
     */
    public function success(Closure $success)
    {
        $this->multiRequest = $this->multiRequest->success(function (Response $response, $index) use ($success) {
            $this->ql->setHtml((string)$response->getBody());
            $success($this->ql, $response, $index);
        });

        return $this;
    }

    /**
     * Register the error callback; it receives ($ql, $reason, $index).
     *
     * @param Closure $error
     * @return $this
     */
    public function error(Closure $error)
    {
        // $reason is intentionally left untyped. In Guzzle 7, ConnectException
        // no longer extends RequestException, so a RequestException type hint
        // here would turn a connection failure into a TypeError instead of
        // delivering it to the user's callback.
        $this->multiRequest = $this->multiRequest->error(function ($reason, $index) use ($error) {
            $error($this->ql, $reason, $index);
        });

        return $this;
    }

    /**
     * Send the requests by invoking the configured method on the underlying
     * multi-request object.
     */
    public function send()
    {
        $this->multiRequest->{$this->method}();
    }
}
|
26
src/Services/PluginService.php
Normal file
26
src/Services/PluginService.php
Normal file
@@ -0,0 +1,26 @@
|
||||
<?php
|
||||
/**
|
||||
* Created by PhpStorm.
|
||||
* User: Jaeger <JaegerCode@gmail.com>
|
||||
* Date: 2017/9/22
|
||||
*/
|
||||
|
||||
namespace QL\Services;
|
||||
|
||||
use QL\QueryList;
|
||||
|
||||
/**
 * Installs plugins on a QueryList instance.
 */
class PluginService
{
    /**
     * Install one plugin, or a list of plugins, by calling each plugin's
     * static install() method.
     *
     * The extra options are only forwarded when a single plugin is given;
     * plugins passed as an array are installed without options.
     *
     * @param QueryList    $queryList  Instance the plugins are installed on
     * @param string|array $plugins    A plugin class name or an array of them
     * @param mixed        ...$opt     Options for a single plugin
     * @return QueryList The same instance, for chaining
     */
    public static function install(QueryList $queryList, $plugins, ...$opt)
    {
        if (!is_array($plugins)) {
            $plugins::install($queryList, ...$opt);

            return $queryList;
        }

        foreach ($plugins as $pluginClass) {
            $pluginClass::install($queryList);
        }

        return $queryList;
    }
}
|
71
tests/Dom/FindTest.php
Normal file
71
tests/Dom/FindTest.php
Normal file
@@ -0,0 +1,71 @@
|
||||
<?php
|
||||
/**
|
||||
* Created by PhpStorm.
|
||||
* User: x
|
||||
* Date: 2018/12/10
|
||||
* Time: 12:46 AM
|
||||
*/
|
||||
|
||||
namespace Tests\Dom;
|
||||
|
||||
|
||||
use QL\QueryList;
|
||||
use Tests\TestCaseBase;
|
||||
|
||||
/**
 * Tests attribute access on elements found with QueryList::find(),
 * using the snippet-1 HTML fixture.
 */
class FindTest extends TestCaseBase
{
    protected $html;
    protected $ql;

    /**
     * Load the snippet-1 fixture into a fresh QueryList before each test.
     */
    protected function setUp(): void
    {
        $this->html = $this->getSnippet('snippet-1');
        $this->ql = QueryList::html($this->html);
    }

    /**
     * Every way of reading the first image's `src` must give the same value,
     * and its other attributes must be readable as properties.
     *
     * @test
     */
    public function find_first_dom_attr()
    {
        $img = [];
        $img[] = $this->ql->find('img')->attr('src');
        $img[] = $this->ql->find('img')->src;
        $img[] = $this->ql->find('img:eq(0)')->src;
        $img[] = $this->ql->find('img')->eq(0)->src;

        $alt = $this->ql->find('img')->alt;
        $abc = $this->ql->find('img')->abc;

        $this->assertCount(1,array_unique($img));
        // PHPUnit's argument order is (expected, actual).
        $this->assertEquals('这是图片',$alt);
        $this->assertEquals('这是一个自定义属性',$abc);
    }

    /**
     * Every way of selecting the second image must give the same `alt`.
     *
     * @test
     */
    public function find_second_dom_attr()
    {
        $img2 = [];
        $img2[] = $this->ql->find('img')->eq(1)->alt;
        $img2[] = $this->ql->find('img:eq(1)')->alt;
        $img2[] = $this->ql->find('.second_pic')->alt;

        $this->assertCount(1,array_unique($img2));
    }

    /**
     * attr('*') returns all attributes: the first image in the fixture has
     * three (src, alt, abc) and the second link has one (href).
     *
     * @test
     */
    public function find_dom_all_attr()
    {
        $imgAttr = $this->ql->find('img:eq(0)')->attr('*');
        $linkAttr = $this->ql->find('a:eq(1)')->attr('*');

        $this->assertCount(3,$imgAttr);
        $this->assertCount(1,$linkAttr);
    }
}
|
43
tests/Dom/RulesTest.php
Normal file
43
tests/Dom/RulesTest.php
Normal file
@@ -0,0 +1,43 @@
|
||||
<?php
|
||||
/**
|
||||
* Created by PhpStorm.
|
||||
* User: Jaeger <JaegerCode@gmail.com>
|
||||
* Date: 18/12/12
|
||||
* Time: 下午12:25
|
||||
*/
|
||||
|
||||
namespace Tests\Dom;
|
||||
|
||||
|
||||
use QL\QueryList;
|
||||
use Tests\TestCaseBase;
|
||||
use Tightenco\Collect\Support\Collection;
|
||||
|
||||
/**
 * Tests rule-based extraction against the snippet-2 HTML fixture.
 */
class RulesTest extends TestCaseBase
{
    protected $html;
    protected $ql;

    /**
     * Load the snippet-2 fixture before each test.
     */
    protected function setUp(): void
    {
        $snippet = $this->getSnippet('snippet-2');

        $this->html = $snippet;
        $this->ql = QueryList::html($snippet);
    }

    /**
     * Rules applied per `ul>li` range yield a Collection with one row per
     * list item (the fixture has three).
     *
     * @test
     */
    public function get_data_by_rules()
    {
        $range = 'ul>li';
        $rules = [
            'a' => ['a','text'],
            'img_src' => ['img','src'],
            'img_alt' => ['img','alt'],
        ];

        $ql = QueryList::rules($rules)->range($range)->html($this->html);
        $data = $ql->query()->getData();

        $this->assertInstanceOf(Collection::class, $data);
        $this->assertCount(3, $data);

        $secondRow = $data[1];
        $this->assertEquals('http://querylist.com/2.jpg', $secondRow['img_src']);
    }
}
|
103
tests/Feature/HttpTest.php
Normal file
103
tests/Feature/HttpTest.php
Normal file
@@ -0,0 +1,103 @@
|
||||
<?php
|
||||
/**
|
||||
* Created by PhpStorm.
|
||||
* User: x
|
||||
* Date: 2018/12/10
|
||||
* Time: 12:35 AM
|
||||
*/
|
||||
|
||||
namespace Tests\Feature;
|
||||
|
||||
|
||||
use GuzzleHttp\Handler\MockHandler;
|
||||
use GuzzleHttp\Psr7\Response;
|
||||
use QL\QueryList;
|
||||
use Tests\TestCaseBase;
|
||||
|
||||
/**
 * Tests for the HTTP services. All tests except can_post_json_data()
 * send real requests to httpbin.org.
 */
class HttpTest extends TestCaseBase
{
    protected $urls;

    /**
     * Prepare the list of URLs used by the request tests.
     */
    protected function setUp(): void
    {
        $this->urls = [
            'http://httpbin.org/get?name=php',
            'http://httpbin.org/get?name=golang',
            'http://httpbin.org/get?name=c++',
            'http://httpbin.org/get?name=java'
        ];
    }

    /**
     * postJson() must send the data JSON-encoded as the request body.
     * A Guzzle MockHandler is passed as the `handler` option to capture
     * the request.
     *
     * @test
     */
    public function can_post_json_data()
    {
        $mock = new MockHandler([new Response()]);
        $data = [
            'name' => 'foo'
        ];
        QueryList::postJson('http://foo.com',$data,[
            'handler' => $mock
        ]);

        // PHPUnit's argument order is (expected, actual).
        $this->assertEquals(json_encode($data),(string)$mock->getLastRequest()->getBody());
    }

    /**
     * Each successful response must belong to the URL at the same index.
     *
     * @test
     */
    public function concurrent_requests_base_use()
    {
        $urls = $this->urls;
        QueryList::getInstance()
            ->multiGet($urls)
            ->success(function(QueryList $ql,Response $response, $index) use($urls){
                $body = json_decode((string)$response->getBody(),true);
                $this->assertEquals($urls[$index],$body['url']);
            })->send();
    }

    /**
     * Concurrency, options and headers can be chained. The custom
     * User-Agent must reach the server, and the error callback must fire
     * for the unreachable URL.
     *
     * @test
     */
    public function concurrent_requests_advanced_use()
    {
        $ua = 'QueryList/4.0';

        $errorUrl = 'http://web-site-not-exist.com';
        $urls = array_merge($this->urls,[$errorUrl]);

        QueryList::rules([])
            ->multiGet($urls)
            ->concurrency(2)
            ->withOptions([
                'timeout' => 60
            ])
            ->withHeaders([
                'User-Agent' => $ua
            ])
            ->success(function (QueryList $ql, Response $response, $index) use($ua){
                $body = json_decode((string)$response->getBody(),true);
                $this->assertEquals($ua,$body['headers']['User-Agent']);
            })
            ->error(function (QueryList $ql, $reason, $index) use($urls,$errorUrl){
                // Expected value first: only the unreachable URL may fail.
                $this->assertEquals($errorUrl,$urls[$index]);
            })
            ->send();
    }

    /**
     * A GET request with the `cache` / `cache_ttl` options must still
     * return the response for the requested URL.
     *
     * @test
     */
    public function request_with_cache()
    {
        $url = $this->urls[0];
        $data = QueryList::get($url,null,[
            'cache' => sys_get_temp_dir(),
            'cache_ttl' => 600
        ])->getHtml();
        $data = json_decode($data,true);

        $this->assertEquals($url,$data['url']);
    }
}
|
48
tests/Feature/InstanceTest.php
Normal file
48
tests/Feature/InstanceTest.php
Normal file
@@ -0,0 +1,48 @@
|
||||
<?php
|
||||
/**
|
||||
* Created by PhpStorm.
|
||||
* User: x
|
||||
* Date: 2018/12/9
|
||||
* Time: 11:10 PM
|
||||
*/
|
||||
|
||||
namespace Tests\Feature;
|
||||
|
||||
|
||||
use QL\QueryList;
|
||||
use Tests\TestCaseBase;
|
||||
|
||||
/**
 * Tests the difference between the shared instance and fresh instances.
 */
class InstanceTest extends TestCaseBase
{
    protected $html;

    /**
     * Load the snippet-1 fixture before each test.
     */
    protected function setUp(): void
    {
        $this->html = $this->getSnippet('snippet-1');
    }

    /**
     * HTML set through getInstance() must be visible through a later
     * getInstance() call.
     *
     * @test
     */
    public function singleton_instance_mode()
    {
        $first = QueryList::getInstance()->html($this->html);
        $second = QueryList::getInstance();

        $this->assertEquals($first->getHtml(), $second->getHtml());
    }

    /**
     * Objects created with `new` or through a static call must not share
     * their HTML.
     *
     * @test
     */
    public function get_new_object()
    {
        // Created explicitly with `new`.
        $withSnippet = (new QueryList())->html($this->html);
        $withoutHtml = (new QueryList())->html('');
        $this->assertNotEquals($withSnippet->getHtml(), $withoutHtml->getHtml());

        // Created implicitly by a static method call.
        $withSnippet = QueryList::range('')->html($this->html);
        $withoutHtml = QueryList::range('')->html('');
        $this->assertNotEquals($withSnippet->getHtml(), $withoutHtml->getHtml());
    }
}
|
36
tests/Feature/MethodTest.php
Normal file
36
tests/Feature/MethodTest.php
Normal file
@@ -0,0 +1,36 @@
|
||||
<?php
|
||||
/**
|
||||
* Created by PhpStorm.
|
||||
* User: x
|
||||
* Date: 2018/12/10
|
||||
* Time: 1:14 AM
|
||||
*/
|
||||
|
||||
namespace Tests\Feature;
|
||||
|
||||
|
||||
use QL\QueryList;
|
||||
use Tests\TestCaseBase;
|
||||
|
||||
/**
 * Tests for the built-in QueryList methods.
 */
class MethodTest extends TestCaseBase
{
    protected $html;

    /**
     * Load the snippet-1 fixture before each test.
     */
    protected function setUp(): void
    {
        $this->html = $this->getSnippet('snippet-1');
    }

    /**
     * pipe() passes the QueryList to the callback and returns the callback's
     * result, so HTML set inside the callback can be read back afterwards.
     *
     * @test
     */
    public function pipe()
    {
        $expected = $this->html;

        $loadSnippet = function (QueryList $ql) use ($expected) {
            $ql->setHtml($expected);

            return $ql;
        };

        $actual = QueryList::pipe($loadSnippet)->getHtml(false);

        $this->assertEquals($expected, $actual);
    }
}
|
20
tests/TestCaseBase.php
Normal file
20
tests/TestCaseBase.php
Normal file
@@ -0,0 +1,20 @@
|
||||
<?php
|
||||
/**
|
||||
* Created by PhpStorm.
|
||||
* User: x
|
||||
* Date: 2018/12/9
|
||||
* Time: 11:43 PM
|
||||
*/
|
||||
|
||||
namespace Tests;
|
||||
|
||||
|
||||
use PHPUnit\Framework\TestCase;
|
||||
|
||||
/**
 * Base class for the test suite; provides access to the HTML fixtures.
 */
class TestCaseBase extends TestCase
{
    /**
     * Read an HTML fixture from the `assets` directory next to this file.
     *
     * @param string $name Fixture file name without the `.html` extension
     * @return string|false File contents, as returned by file_get_contents()
     */
    public function getSnippet($name)
    {
        $path = __DIR__.'/assets/'.$name.'.html';

        return file_get_contents($path);
    }
}
|
9
tests/assets/snippet-1.html
Normal file
9
tests/assets/snippet-1.html
Normal file
@@ -0,0 +1,9 @@
|
||||
<div id="one">
|
||||
<div class="two">
|
||||
<a href="http://querylist.cc">QueryList官网</a>
|
||||
<img src="http://querylist.com/1.jpg" alt="这是图片" abc="这是一个自定义属性">
|
||||
<img class="second_pic" src="http://querylist.com/2.jpg" alt="这是图片2">
|
||||
<a href="http://doc.querylist.cc">QueryList文档</a>
|
||||
</div>
|
||||
<span>其它的<b>一些</b>文本</span>
|
||||
</div>
|
16
tests/assets/snippet-2.html
Normal file
16
tests/assets/snippet-2.html
Normal file
@@ -0,0 +1,16 @@
|
||||
<div id="one">
|
||||
<ul>
|
||||
<li>
|
||||
<a href="http://querylist.cc">QueryList官网</a>
|
||||
<img src="http://querylist.com/1.jpg" alt="这是图片1" abc="这是一个自定义属性1">
|
||||
</li>
|
||||
<li>
|
||||
<a href="http://v3.querylist.cc">QueryList V3文档</a>
|
||||
<img src="http://querylist.com/2.jpg" alt="这是图片2" abc="这是一个自定义属性2">
|
||||
</li>
|
||||
<li>
|
||||
<a href="http://v4.querylist.cc">QueryList V4文档</a>
|
||||
<img src="http://querylist.com/3.jpg" alt="这是图片3" abc="这是一个自定义属性3">
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
5
tests/bootstrap.php
Normal file
5
tests/bootstrap.php
Normal file
@@ -0,0 +1,5 @@
|
||||
<?php
|
||||
|
||||
// Remove the script execution time limit for the test run.
set_time_limit(0);

// Load Composer's autoloader from the project root.
require __DIR__.'/../vendor/autoload.php';
|
Reference in New Issue
Block a user