72 Commits

Author SHA1 Message Date
Jaeger(黄杰)
894fb4344e Merge pull request #145 from maxiaozhi/master
正则匹配成功时才替换掉html
2021-08-08 13:04:55 +08:00
lion
e4fc716acd 正则匹配成功时才替换掉html 2021-07-18 23:37:35 +08:00
Jaeger(黄杰)
39dc0ca9c6 Merge pull request #143 from maxiaozhi/patch-1
Fix the matching exception
2021-07-05 14:07:58 +08:00
maxiaozhi
ef0a2efd4f Fix the matching exception
Fix the matching exception when the page contains multiple tags prefixed with head (for example: < head >, < header >)
2021-07-05 13:51:24 +08:00
huangjie
5953daac54 update collect 2020-12-14 10:39:28 +08:00
huangjie
465c6aefc7 update collect 2020-09-27 17:41:44 +08:00
Jaeger(黄杰)
92cb319d44 Update README-ZH.md 2020-07-18 13:06:29 +08:00
Jaeger(黄杰)
cbf3e0fcad Update README.md 2020-07-18 13:05:59 +08:00
Jaeger(黄杰)
cfa2d94a79 Update FUNDING.yml 2020-07-17 13:20:49 +08:00
Jaeger(黄杰)
47a444bf9e Create FUNDING.yml 2020-07-17 13:08:44 +08:00
Jaeger
85903fa9b5 feat: rules add attrs 2020-04-03 20:16:00 +08:00
Jaeger(黄杰)
e527c637c7 Merge pull request #110 from jae-jae/develop
replace collect()
2020-04-03 04:55:27 -05:00
Jaeger
f0a9798925 replace collect() 2020-04-03 17:33:32 +08:00
Jaeger
faea883c6f fix: data callback 2020-04-01 22:03:50 +08:00
Jaeger
c16826a573 updaed composer dependency 2020-03-23 18:15:04 +08:00
Jaeger
1492751f98 feat: optimization getHtml() 2020-03-22 17:19:57 +08:00
Jaeger
b7954b9aef fix: memory overflow 2020-03-20 13:26:40 +08:00
Jaeger
b3d84cf057 feat: modify the each function of class elements 2020-03-15 14:17:18 +08:00
Jaeger
52bbdeae14 Merge branch 'master' of github.com:jae-jae/QueryList into develop 2020-03-15 14:07:52 +08:00
Jaeger(黄杰)
25b2dbdc86 Merge pull request #105 from edwinhuish/add-each-function-same-as-collection
添加 each function 并和 Collection 保持一致,返回 false 时中断循环。
2020-03-15 01:07:22 -05:00
Jaeger
02c2b125d8 feat: elements class add htmlOuters function 2020-03-15 13:58:00 +08:00
Jaeger
fc8b701ef2 feat: optimize range results 2020-03-15 13:45:00 +08:00
Jaeger
75e436c73f feat: merge master 2020-03-15 11:30:35 +08:00
Jaeger(黄杰)
aa90e5a21d Merge pull request #106 from edwinhuish/destroy-old-phpquey-object-when-setHtml
destroy old phpquery object when setHtml
2020-03-14 22:28:13 -05:00
Jaeger
dd9af6881d feat: rules add texts and htmls attribute 2020-03-13 21:42:25 +08:00
Jaeger
b07d4bfc74 feat: rules add texts and htmls attribute 2020-03-13 21:39:42 +08:00
Edwin Xu
8c1614c4c3 destroy old phpquery object when setHtml 2020-03-13 16:08:55 +08:00
Jaeger
b387ef5bb0 feat: rules add htmlOuter attribute 2020-03-13 15:16:44 +08:00
Edwin Xu
67f0052c5d 添加 each function 并和 Collection 保持一致,返回 false 时中断循环。 2020-03-13 14:20:37 +08:00
Jaeger
7c86f82527 fix: optimize memory usage 2020-03-13 13:49:36 +08:00
Jaeger(黄杰)
6ee6a26aee Merge pull request #102 from edwinhuish/auto-destroy-phpquery-document
destroy phpquery document object when destruct Query class
2020-03-11 10:29:31 -05:00
Jaeger(黄杰)
116f19da65 Merge pull request #104 from edwinhuish/add-phpdoc
fix phpdoc
2020-03-11 10:20:22 -05:00
Edwin Xu
67cbd0f473 修复phpdoc 2020-03-10 21:36:55 +08:00
Edwin Xu
3eb26451c6 修复phpdoc 2020-03-10 21:03:25 +08:00
Edwin Xu
a76ecb4258 destroy phpquery document object when destruct Query class 2020-03-05 22:27:27 +08:00
Jaeger
46f564bc8b Updated phpQuery 2019-02-22 15:33:54 +08:00
Jaeger
df9e3bbf19 test htpp cache 2018-12-12 15:29:31 +08:00
Jaeger
0c85eed7ef add multiGet and multiPost 2018-12-11 17:52:41 +08:00
Jaeger
df521923ac Concurrent requests 2018-12-11 00:00:17 +08:00
Jaeger
a779ef71f3 add MultiRequest 2018-12-10 19:23:15 +08:00
Jaeger
c32736bd9e add pipe 2018-12-10 01:27:48 +08:00
Jaeger
661bc3168d add phpunit 2018-12-10 00:13:16 +08:00
Jaeger
6d182ff061 remove instance 2018-12-07 00:35:58 +08:00
Jaeger
1c2e3f4adf add queryData() 2018-10-15 18:52:12 +08:00
Jaeger
1d73895981 single instance 2017-12-15 11:05:32 +08:00
Jaeger
03e6a955bf add https verify false 2017-12-14 10:31:47 +08:00
Jaeger
72a7543da3 fix laravel conflict bug 2017-11-15 10:46:51 +08:00
Jaeger
9d04003d73 fix laravel conflict bug 2017-11-15 10:43:28 +08:00
Jaeger
31ec950cdc ok 2017-10-09 11:27:08 +08:00
Jaeger
18bc6daea4 ok 2017-10-09 02:44:07 +08:00
Jaeger
f2c6ce7385 add comments 2017-10-09 01:48:56 +08:00
Jaeger
c0ed870dc8 ok 2017-10-08 23:01:22 +08:00
Jaeger
a4d0087e47 update README 2017-10-08 22:48:06 +08:00
Jaeger
a0f7b9aa3e ok 2017-10-02 10:30:24 +08:00
Jaeger
d812c47ede update 2017-10-01 23:37:09 +08:00
Jaeger
47c0f37233 update README 2017-10-01 12:49:01 +08:00
Jaeger
967ef10f23 ok 2017-10-01 01:14:32 +08:00
Jaeger
c82eb3c557 ok 2017-10-01 01:13:39 +08:00
Jaeger
f68cc2e218 add EN README 2017-10-01 01:11:47 +08:00
Jaeger
684e52c70e ok 2017-10-01 00:23:34 +08:00
Jaeger
777d837f18 update README 2017-09-30 21:49:07 +08:00
Jaeger
6e9a202ac2 update README 2017-09-30 21:46:31 +08:00
Jaeger
e885eece26 ok 2017-09-30 12:09:51 +08:00
Jaeger
aeeec5367e ok 2017-09-30 12:04:27 +08:00
Jaeger
c42a7b1766 ok 2017-09-30 12:02:25 +08:00
Jaeger
a3a830a744 add logo 2017-09-30 12:01:15 +08:00
Jaeger
7381ec21d3 update REMADE 2017-09-30 11:32:09 +08:00
Jaeger
95102a5ce2 ok 2017-09-30 01:41:09 +08:00
Jaeger
520195c929 update COMMUNITY 2017-09-30 01:39:16 +08:00
Jaeger
75799decc3 add COMMUNITY 2017-09-30 01:12:00 +08:00
Jaeger
33c574cdb9 ok 2017-09-29 23:47:35 +08:00
Jaeger
47a777789b ok 2017-09-29 18:43:24 +08:00
25 changed files with 1344 additions and 300 deletions

12
.github/FUNDING.yml vendored Normal file
View File

@@ -0,0 +1,12 @@
# These are supported funding model platforms
github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
patreon: # Replace with a single Patreon username
open_collective: querylist # Replace with a single Open Collective username
ko_fi: # Replace with a single Ko-fi username
tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
liberapay: # Replace with a single Liberapay username
issuehunt: # Replace with a single IssueHunt username
otechie: # Replace with a single Otechie username
custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']

4
.gitignore vendored
View File

@@ -1,3 +1,5 @@
/vendor/
.idea/
composer.lock
composer.lock
.DS_Store
*.cache

309
README-ZH.md Normal file
View File

@@ -0,0 +1,309 @@
<p align="center">
<img width="150" src="logo.png" alt="QueryList">
<br>
<br>
</p>
# QueryList 简介
`QueryList`是一套简洁、优雅、可扩展的PHP采集工具(爬虫)基于phpQuery。
## 特性
- 拥有与jQuery完全相同的CSS3 DOM选择器
- 拥有与jQuery完全相同的DOM操作API
- 拥有通用的列表采集方案
- 拥有强大的HTTP请求套件,轻松实现如:模拟登陆、伪造浏览器、HTTP代理等任意复杂的网络请求
- 拥有乱码解决方案
- 拥有强大的内容过滤功能,可使用jQuery选择器来过滤内容
- 拥有高度的模块化设计,扩展性强
- 拥有富有表现力的API
- 拥有高质量文档
- 拥有丰富的插件
- 拥有专业的问答社区和交流群
通过插件可以轻松实现诸如:
- 多线程采集
- 采集JavaScript动态渲染的页面 (PhantomJS/headless WebKit)
- 图片本地化
- 模拟浏览器行为提交Form表单
- 网络爬虫
- .....
## 环境要求
- PHP >= 7.1
> 如果你的PHP版本还停留在PHP5或者不会使用Composer,你可以选择使用QueryList3,QueryList3支持php5.3以及手动安装。
QueryList3 文档:http://v3.querylist.cc
## 安装
通过Composer安装:
```
composer require jaeger/querylist
```
## 使用
#### 元素操作
- 采集「昵图网」所有图片地址
```php
QueryList::get('http://www.nipic.com')->find('img')->attrs('src');
```
- 采集百度搜索结果
```php
$ql = QueryList::get('http://www.baidu.com/s?wd=QueryList');
$ql->find('title')->text(); // 获取网站标题
$ql->find('meta[name=keywords]')->content; // 获取网站头部关键词
$ql->find('h3>a')->texts(); //获取搜索结果标题列表
$ql->find('h3>a')->attrs('href'); //获取搜索结果链接列表
$ql->find('img')->src; //获取第一张图片的链接地址
$ql->find('img:eq(1)')->src; //获取第二张图片的链接地址
$ql->find('img')->eq(2)->src; //获取第三张图片的链接地址
// 遍历所有图片
$ql->find('img')->map(function($img){
echo $img->alt; //打印图片的alt属性
});
```
- 更多用法
```php
$ql->find('#head')->append('<div>追加内容</div>')->find('div')->htmls();
$ql->find('.two')->children('img')->attrs('alt'); //获取class为two元素下的所有img孩子节点
//遍历class为two元素下的所有孩子节点
$data = $ql->find('.two')->children()->map(function ($item){
//用is判断节点类型
if($item->is('a')){
return $item->text();
}elseif($item->is('img'))
{
return $item->alt;
}
});
$ql->find('a')->attr('href', 'newVal')->removeClass('className')->html('newHtml')->...
$ql->find('div > p')->add('div > ul')->filter(':has(a)')->find('p:first')->nextAll()->andSelf()->...
$ql->find('div.old')->replaceWith( $ql->find('div.new')->clone())->appendTo('.trash')->prepend('Deleted')->...
```
#### 列表采集
采集百度搜索结果列表的标题和链接:
```php
$data = QueryList::get('http://www.baidu.com/s?wd=QueryList')
// 设置采集规则
->rules([
'title'=>array('h3','text'),
'link'=>array('h3>a','href')
])
->query()->getData();
print_r($data->all());
```
采集结果:
```
Array
(
[0] => Array
(
[title] => QueryList|基于phpQuery的无比强大的PHP采集工具
[link] => http://www.baidu.com/link?url=GU_YbDT2IHk4ns1tjG2I8_vjmH0SCJEAPuuZN
)
[1] => Array
(
[title] => PHP 用QueryList抓取网页内容 - wb145230 - 博客园
[link] => http://www.baidu.com/link?url=zn0DXBnrvIF2ibRVW34KcRVFG1_bCdZvqvwIhUqiXaS
)
[2] => Array
(
[title] => 介绍- QueryList指导文档
[link] => http://www.baidu.com/link?url=pSypvMovqS4v2sWeQo5fDBJ4EoYhXYi0Lxx
)
//...
)
```
#### 编码转换
```php
// 输出编码:UTF-8,输入编码:GB2312
QueryList::get('https://top.etao.com')->encoding('UTF-8','GB2312')->find('a')->texts();
// 输出编码:UTF-8,输入编码:自动识别
QueryList::get('https://top.etao.com')->encoding('UTF-8')->find('a')->texts();
```
#### HTTP网络操作GuzzleHttp
- 携带cookie登录新浪微博
```php
//采集新浪微博需要登录才能访问的页面
$ql = QueryList::get('http://weibo.com','param1=testvalue & params2=somevalue',[
'headers' => [
//填写从浏览器获取到的cookie
'Cookie' => 'SINAGLOBAL=546064; wb_cmtLike_2112031=1; wvr=6;....'
]
]);
//echo $ql->getHtml();
echo $ql->find('title')->text();
//输出: 我的首页 微博-随时随地发现新鲜事
```
- 使用Http代理
```php
$urlParams = ['param1' => 'testvalue','params2' => 'somevalue'];
$opts = [
// 设置http代理
'proxy' => 'http://222.141.11.17:8118',
//设置超时时间,单位:秒
'timeout' => 30,
// 伪造http头
'headers' => [
'Referer' => 'https://querylist.cc/',
'User-Agent' => 'testing/1.0',
'Accept' => 'application/json',
'X-Foo' => ['Bar', 'Baz'],
'Cookie' => 'abc=111;xxx=222'
]
];
$ql->get('http://httpbin.org/get',$urlParams,$opts);
// echo $ql->getHtml();
```
- 模拟登录
```php
// 用post登录
$ql = QueryList::post('http://xxxx.com/login',[
'username' => 'admin',
'password' => '123456'
])->get('http://xxx.com/admin');
//采集需要登录才能访问的页面
$ql->get('http://xxx.com/admin/page');
//echo $ql->getHtml();
```
#### Form表单操作
模拟登陆GitHub
```php
// 获取QueryList实例
$ql = QueryList::getInstance();
//获取到登录表单
$form = $ql->get('https://github.com/login')->find('form');
//填写GitHub用户名和密码
$form->find('input[name=login]')->val('your github username or email');
$form->find('input[name=password]')->val('your github password');
//序列化表单数据
$fromData = $form->serializeArray();
$postData = [];
foreach ($fromData as $item) {
$postData[$item['name']] = $item['value'];
}
//提交登录表单
$actionUrl = 'https://github.com'.$form->attr('action');
$ql->post($actionUrl,$postData);
//判断登录是否成功
// echo $ql->getHtml();
$userName = $ql->find('.header-nav-current-user>.css-truncate-target')->text();
if($userName)
{
echo '登录成功!欢迎你:'.$userName;
}else{
echo '登录失败!';
}
```
#### Bind功能扩展
自定义扩展一个`myHttp`方法:
```php
$ql = QueryList::getInstance();
//绑定一个myHttp方法到QueryList对象
$ql->bind('myHttp',function ($url){
// $this 为当前的QueryList对象
$html = file_get_contents($url);
$this->setHtml($html);
return $this;
});
//然后就可以通过注册的名字来调用
$data = $ql->myHttp('https://toutiao.io')->find('h3 a')->texts();
print_r($data->all());
```
或者把实现体封装到class然后这样绑定:
```php
$ql->bind('myHttp',function ($url){
return new MyHttp($this,$url);
});
```
#### 插件使用
- 使用PhantomJS插件采集JavaScript动态渲染的页面:
```php
// 安装时设置PhantomJS二进制文件路径
$ql = QueryList::use(PhantomJs::class,'/usr/local/bin/phantomjs');
// 采集今日头条手机版
$data = $ql->browser('https://m.toutiao.com')->find('p')->texts();
print_r($data->all());
// 使用HTTP代理
$ql->browser('https://m.toutiao.com',false,[
'--proxy' => '192.168.1.42:8080',
'--proxy-type' => 'http'
])
```
- 使用CURL多线程插件,多线程采集GitHub排行榜:
```php
$ql = QueryList::use(CurlMulti::class);
$ql->curlMulti([
'https://github.com/trending/php',
'https://github.com/trending/go',
//.....more urls
])
// 每个任务成功完成调用此回调
->success(function (QueryList $ql,CurlMulti $curl,$r){
echo "Current url:{$r['info']['url']} \r\n";
$data = $ql->find('h3 a')->texts();
print_r($data->all());
})
// 每个任务失败回调
->error(function ($errorInfo,CurlMulti $curl){
echo "Current url:{$errorInfo['info']['url']} \r\n";
print_r($errorInfo['error']);
})
->start([
// 最大并发数
'maxThread' => 10,
// 错误重试次数
'maxTry' => 3,
]);
```
## 插件
- [jae-jae/QueryList-PhantomJS](https://github.com/jae-jae/QueryList-PhantomJS): 使用PhantomJS采集JavaScript动态渲染的页面
- [jae-jae/QueryList-CurlMulti](https://github.com/jae-jae/QueryList-CurlMulti) : Curl多线程采集
- [jae-jae/QueryList-AbsoluteUrl](https://github.com/jae-jae/QueryList-AbsoluteUrl) : 转换URL相对路径到绝对路径
- [jae-jae/QueryList-Rule-Google](https://github.com/jae-jae/QueryList-Rule-Google) : 谷歌搜索引擎
- [jae-jae/QueryList-Rule-Baidu](https://github.com/jae-jae/QueryList-Rule-Baidu) : 百度搜索引擎
查看更多的QueryList插件和基于QueryList的产品:[QueryList社区力量](https://github.com/jae-jae/QueryList-Community)
## 贡献
欢迎为QueryList贡献代码。关于贡献插件可以查看:[QueryList插件贡献说明](https://github.com/jae-jae/QueryList-Community/blob/master/CONTRIBUTING.md)
## 寻求帮助?
- QueryList主页: [http://querylist.cc](http://querylist.cc/)
- QueryList文档: [http://doc.querylist.cc](http://doc.querylist.cc/)
- QueryList问答:[http://wenda.querylist.cc](http://wenda.querylist.cc/)
- QueryList交流QQ群:123266961 <a target="_blank" href="http://shang.qq.com/wpa/qunwpa?idkey=a1b248ae30b3f711bdab4f799df839300dc7fed54331177035efa0513da027f6"><img border="0" src="http://pub.idqqimg.com/wpa/images/group.png" alt="cafeEX" title="cafeEX"></a>
- GitHub:https://github.com/jae-jae/QueryList
- Git@OSC:http://git.oschina.net/jae/QueryList
## Author
Jaeger <JaegerCode@gmail.com>
## License
QueryList is licensed under the license of MIT. See the LICENSE for more details.

248
README.md
View File

@@ -1,73 +1,79 @@
# QueryList 简介
`QueryList`是一套简洁、优雅的PHP采集工具基于phpQuery。
<p align="center">
<img width="150" src="logo.png" alt="QueryList">
<br>
<br>
</p>
## 特性
- 拥有与jQuery完全相同的CSS3 DOM选择器
- 拥有与jQuery完全相同的DOM操作API
- 拥有通用的列表采集方案
- 拥有强大的HTTP请求套件轻松实现如模拟登陆、伪造浏览器、HTTP代理等意复杂的网络请求
- 拥有乱码解决方案
- 拥有强大的内容过滤功能可使用jQuey选择器来过滤内容
- 拥有高度的模块化设计,扩展性强
- 拥有富有表现力的API
- 拥有高质量文档
- 拥有丰富的插件
- 拥有专业的问答社区和交流群
# QueryList
`QueryList` is a simple, elegant, extensible PHP Web Scraper (crawler/spider) ,based on phpQuery.
通过插件可以轻松实现诸如:
- 多线程采集
- 图片本地化
- 模拟浏览器行为提交Form表单
- 网络爬虫
[API Documentation](https://github.com/jae-jae/QueryList/wiki)
[中文文档](README-ZH.md)
## Features
- Have the same CSS3 DOM selector as jQuery
- Have the same DOM manipulation API as jQuery
- Have a generic list crawling program
- Have a powerful HTTP request suite that makes it easy to perform complex network requests such as simulated login, browser spoofing, and HTTP proxying
- Have a solution for garbled text (character-encoding problems)
- Have powerful content filtering; you can use jQuery selectors to filter content
- Have a highly modular design with strong extensibility
- Have an expressive API
- Has a wealth of plug-ins
Through plug-ins you can easily implement things like:
- Multithreaded crawl
- Crawl pages rendered dynamically by JavaScript (PhantomJS/headless WebKit)
- Image downloads to local
- Simulate browser behavior such as submitting Form forms
- Web crawler
- .....
## 环境要求
- PHP >= 7.0
## Requirements
- PHP >= 7.1
> 如果你的PHP版本还停留在PHP5或者不会使用Composer,你可以选择使用QueryList3,QueryList3支持php5.3以及手动安装。
QueryList3 文档:http://v3.querylist.cc
## 安装
通过Composer安装:
## Installation
Install via Composer:
```
composer require jaeger/querylist:dev-master
composer require jaeger/querylist
```
## 使用
## Usage
#### 元素操作
- 采集「昵图网」所有图片地址
#### DOM Traversal and Manipulation
- Crawl all image links on GitHub
```php
QueryList::get('http://www.nipic.com')->find('img')->attrs('src');
QueryList::get('https://github.com')->find('img')->attrs('src');
```
- 采集百度搜索结果
- Crawl Google search results
```php
$ql = QueryList::get('http://www.baidu.com/s?wd=QueryList');
$ql = QueryList::get('https://www.google.co.jp/search?q=QueryList');
$ql->find('title')->text(); // 获取网站标题
$ql->find('meta[name=keywords]')->content; // 获取网站头部关键词
$ql->find('title')->text(); //The page title
$ql->find('meta[name=keywords]')->content; //The page keywords
$ql->find('h3>a')->texts(); //获取搜索结果标题列表
$ql->find('h3>a')->attrs('href'); //获取搜索结果链接列表
$ql->find('h3>a')->texts(); //Get a list of search results titles
$ql->find('h3>a')->attrs('href'); //Get a list of search results links
$ql->find('img')->src; //获取第一张图片的链接地址
$ql->find('img:eq(1)')->src; //获取第二张图片的链接地址
$ql->find('img')->eq(2)->src; //获取第三张图片的链接地址
// 遍历所有图片
$ql->find('img')->src; //Gets the link address of the first image
$ql->find('img:eq(1)')->src; //Gets the link address of the second image
$ql->find('img')->eq(2)->src; //Gets the link address of the third image
// Loop all the images
$ql->find('img')->map(function($img){
echo $img->alt; //打印图片的alt属性
echo $img->alt; //Print the alt attribute of the image
});
```
- 更多用法
- More usage
```php
$ql->find('#head')->append('<div>追加内容</div>')->find('div')->htmls();
$ql->find('.two')->children('img')->attrs('alt'); //获取class为two元素下的所有img孩子节点
//遍历class为two元素下的所有孩子节点
$ql->find('#head')->append('<div>Append content</div>')->find('div')->htmls();
$ql->find('.two')->children('img')->attrs('alt'); // Get the class is the "two" element under all img child nodes
// Loop class is the "two" element under all child nodes
$data = $ql->find('.two')->children()->map(function ($item){
//用is判断节点类型
// Use "is" to determine the node type
if($item->is('a')){
return $item->text();
}elseif($item->is('img'))
@@ -80,11 +86,11 @@ $ql->find('a')->attr('href', 'newVal')->removeClass('className')->html('newHtml'
$ql->find('div > p')->add('div > ul')->filter(':has(a)')->find('p:first')->nextAll()->andSelf()->...
$ql->find('div.old')->replaceWith( $ql->find('div.new')->clone())->appendTo('.trash')->prepend('Deleted')->...
```
#### 列表采集
采集百度搜索结果列表的标题和链接:
#### List crawl
Crawl the title and link of the Google search results list:
```php
$data = QueryList::get('http://www.baidu.com/s?wd=QueryList')
// 设置采集规则
$data = QueryList::get('https://www.google.co.jp/search?q=QueryList')
// Set the crawl rules
->rules([
'title'=>array('h3','text'),
'link'=>array('h3>a','href')
@@ -93,60 +99,62 @@ $data = QueryList::get('http://www.baidu.com/s?wd=QueryList')
print_r($data->all());
```
采集结果:
Results:
```
Array
(
[0] => Array
(
[title] => QueryList|基于phpQuery的无比强大的PHP采集工具
[link] => http://www.baidu.com/link?url=GU_YbDT2IHk4ns1tjG2I8_vjmH0SCJEAPuuZN
[title] => Angular - QueryList
[link] => https://angular.io/api/core/QueryList
)
[1] => Array
(
[title] => PHP 用QueryList抓取网页内容 - wb145230 - 博客园
[link] => http://www.baidu.com/link?url=zn0DXBnrvIF2ibRVW34KcRVFG1_bCdZvqvwIhUqiXaS
[title] => QueryList | @angular/core - Angularリファレンス - Web Creative Park
[link] => http://www.webcreativepark.net/angular/querylist/
)
[2] => Array
(
[title] => 介绍- QueryList指导文档
[link] => http://www.baidu.com/link?url=pSypvMovqS4v2sWeQo5fDBJ4EoYhXYi0Lxx
[title] => QueryListにQueryを追加したり、追加されたことを感知する | TIPS ...
[link] => http://www.webcreativepark.net/angular/querylist_query_add_subscribe/
)
//...
)
```
#### 编码转换
#### Encode convert
```php
// 输出编码:UTF-8,输入编码:GB2312
// Out charset :UTF-8
// In charset :GB2312
QueryList::get('https://top.etao.com')->encoding('UTF-8','GB2312')->find('a')->texts();
// 输出编码:UTF-8,输入编码:自动识别
// Out charset:UTF-8
// In charset:Automatic Identification
QueryList::get('https://top.etao.com')->encoding('UTF-8')->find('a')->texts();
```
#### HTTP网络操作
- 携带cookie登录新浪微博
#### HTTP Client (GuzzleHttp)
- Log in to GitHub using a cookie
```php
//采集新浪微博需要登录才能访问的页面
$ql = QueryList::get('http://weibo.com','param1=testvalue & params2=somevalue',[
'headers' => [
//填写从浏览器获取到的cookie
'Cookie' => 'SINAGLOBAL=546064; wb_cmtLike_2112031=1; wvr=6;....'
]
//Crawl GitHub content
$ql = QueryList::get('https://github.com','param1=testvalue & params2=somevalue',[
'headers' => [
// Fill in the cookie from the browser
'Cookie' => 'SINAGLOBAL=546064; wb_cmtLike_2112031=1; wvr=6;....'
]
]);
//echo $ql->getHtml();
echo $ql->find('title')->text();
//输出: 我的首页 微博-随时随地发现新鲜事
$userName = $ql->find('.header-nav-current-user>.css-truncate-target')->text();
echo $userName;
```
- 使用Http代理
- Use the Http proxy
```php
$urlParams = ['param1' => 'testvalue','params2' => 'somevalue'];
$opts = [
// 设置http代理
// Set the http proxy
'proxy' => 'http://222.141.11.17:8118',
//设置超时时间,单位:秒
//Set the timeout time in seconds
'timeout' => 30,
// 伪造http头
// Fake HTTP headers
'headers' => [
'Referer' => 'https://querylist.cc/',
'User-Agent' => 'testing/1.0',
@@ -159,75 +167,94 @@ $ql->get('http://httpbin.org/get',$urlParams,$opts);
// echo $ql->getHtml();
```
- 模拟登录
- Simulated login
```php
// 用post登录
// Post login
$ql = QueryList::post('http://xxxx.com/login',[
'username' => 'admin',
'password' => '123456'
])->get('http://xxx.com/admin');
//采集需要登录才能访问的页面
// Crawl pages that need to be logged in to access
$ql->get('http://xxx.com/admin/page');
//echo $ql->getHtml();
```
#### Form表单操作
模拟登陆GitHub
#### Submit forms
Log in to GitHub:
```php
// 获取QueryList实例
// Get the QueryList instance
$ql = QueryList::getInstance();
//获取到登录表单
// Get the login form
$form = $ql->get('https://github.com/login')->find('form');
//填写GitHub用户名和密码
// Fill in the GitHub username and password
$form->find('input[name=login]')->val('your github username or email');
$form->find('input[name=password]')->val('your github password');
//序列化表单数据
// Serialize the form data
$fromData = $form->serializeArray();
$postData = [];
foreach ($fromData as $item) {
$postData[$item['name']] = $item['value'];
}
//提交登录表单
// Submit the login form
$actionUrl = 'https://github.com'.$form->attr('action');
$ql->post($actionUrl,$postData);
//判断登录是否成功
// To determine whether the login is successful
// echo $ql->getHtml();
$userName = $ql->find('.header-nav-current-user>.css-truncate-target')->text();
if($userName)
{
echo '登录成功!欢迎你:'.$userName;
echo 'Login successful ! Welcome:'.$userName;
}else{
echo '登录失败!';
echo 'Login failed !';
}
```
#### Bind功能扩展
自定义扩展一个`myHttp`方法:
#### Bind function extension
Customize the extension of a `myHttp` method:
```php
$ql = QueryList::getInstance();
//绑定一个myHttp方法到QueryList对象
//Bind a `myHttp` method to the QueryList object
$ql->bind('myHttp',function ($url){
// $this is the current QueryList object
$html = file_get_contents($url);
$this->setHtml($html);
return $this;
});
//然后就可以通过注册的名字来调用
// And then you can call by the name of the binding
$data = $ql->myHttp('https://toutiao.io')->find('h3 a')->texts();
print_r($data->all());
```
或者把实现体封装到class然后这样绑定:
Or encapsulate the implementation in a class, and then bind it:
```php
$ql->bind('myHttp',function ($url){
return new MyHttp($this,$url);
});
```
#### 插件使用
使用CURL多线程插件,多线程采集GitHub排行榜:
#### Using plugins
- Use the PhantomJS plugin to crawl JavaScript dynamically rendered pages:
```php
// Set the PhantomJS binary file path during installation
$ql = QueryList::use(PhantomJs::class,'/usr/local/bin/phantomjs');
// Crawl「500px」all picture links
$data = $ql->browser('https://500px.com/editors')->find('img')->attrs('src');
print_r($data->all());
// Use the HTTP proxy
$ql->browser('https://500px.com/editors',false,[
'--proxy' => '192.168.1.42:8080',
'--proxy-type' => 'http'
])
```
- Using the CURL multithreading plug-in, multi-threaded crawling GitHub trending :
```php
$ql = QueryList::use(CurlMulti::class);
$ql->curlMulti([
@@ -235,40 +262,43 @@ $ql->curlMulti([
'https://github.com/trending/go',
//.....more urls
])
// 每个任务成功完成调用此回调
// Called if task is success
->success(function (QueryList $ql,CurlMulti $curl,$r){
echo "Current url:{$r['info']['url']} \r\n";
$data = $ql->find('h3 a')->texts();
print_r($data->all());
})
// 每个任务失败回调
// Task fail callback
->error(function ($errorInfo,CurlMulti $curl){
echo "Current url:{$errorInfo['info']['url']} \r\n";
print_r($errorInfo['error']);
})
->start([
// 最大并发数
// Maximum number of threads
'maxThread' => 10,
// 错误重试次数
// Number of error retries
'maxTry' => 3,
]);
```
## 插件
- [jae-jae/QueryList-AbsoluteUrl](https://github.com/jae-jae/QueryList-AbsoluteUrl) : 转换URL相对路径到绝对路径
- [jae-jae/QueryList-CurlMulti](https://github.com/jae-jae/QueryList-CurlMulti) : Curl多线程采集
## Plugins
- [jae-jae/QueryList-PhantomJS](https://github.com/jae-jae/QueryList-PhantomJS):Use PhantomJS to crawl Javascript dynamically rendered page.
- [jae-jae/QueryList-CurlMulti](https://github.com/jae-jae/QueryList-CurlMulti) : Curl multi threading.
- [jae-jae/QueryList-AbsoluteUrl](https://github.com/jae-jae/QueryList-AbsoluteUrl) : Converting relative urls to absolute.
- [jae-jae/QueryList-Rule-Google](https://github.com/jae-jae/QueryList-Rule-Google) : Google searcher.
- [jae-jae/QueryList-Rule-Baidu](https://github.com/jae-jae/QueryList-Rule-Baidu) : Baidu searcher.
## 寻求帮助?
- QueryList交流社区: [http://querylist.cc/](http://querylist.cc/)
- QueryList文档: [http://doc.querylist.cc/](http://doc.querylist.cc/)
- QueryList交流QQ群:123266961 <a target="_blank" href="http://shang.qq.com/wpa/qunwpa?idkey=a1b248ae30b3f711bdab4f799df839300dc7fed54331177035efa0513da027f6"><img border="0" src="http://pub.idqqimg.com/wpa/images/group.png" alt="╰☆邪恶 魔方☆" title="╰☆邪恶 魔方☆"></a>
- Git@OSC:http://git.oschina.net/jae/QueryList
- GitHub:https://github.com/jae-jae/QueryList
View more QueryList plugins and QueryList-based products: [QueryList Community](https://github.com/jae-jae/QueryList-Community)
## Contributing
Contributions to QueryList are welcome. For details on contributing plugins, see the [QueryList Plugin Contributing Guide](https://github.com/jae-jae/QueryList-Community/blob/master/CONTRIBUTING.md).
## Author
Jaeger <JaegerCode@gmail.com>
If this library is useful for you, say thanks by [buying me a beer :beer:](https://www.paypal.me/jaepay)!
## License
QueryList is licensed under the license of MIT. See the LICENSE for more details.
QueryList is licensed under the license of MIT. See the LICENSE for more details.

View File

@@ -1,16 +1,17 @@
{
"name": "jaeger/querylist",
"description": "QueryList是基于phpQuery的无比强大的PHP采集工具",
"description": "Simple, elegant, extensible PHP Web Scraper (crawler/spider),Use the css3 dom selector,Based on phpQuery! 简洁、优雅、可扩展的PHP采集工具(爬虫)基于phpQuery。",
"keywords":["QueryList","phpQuery","spider"],
"homepage": "http://querylist.cc",
"require": {
"PHP":">=7.0",
"jaeger/phpquery-single": "^0.9",
"tightenco/collect": "^5.5",
"jaeger/g-http": "^1.1"
"PHP":">=7.1",
"jaeger/phpquery-single": "^1",
"jaeger/g-http": "^1.1",
"ext-dom": "*",
"tightenco/collect": ">5.0"
},
"suggest":{
"monolog/monolog":"Log support"
},
"license": "MIT",
"authors": [
@@ -24,7 +25,16 @@
"QL\\":"src"
}
},
"autoload-dev": {
"psr-4": {
"Tests\\": "tests/"
}
},
"require-dev": {
"symfony/var-dumper": "^3.3"
"symfony/var-dumper": "^3.3",
"phpunit/phpunit": "^8.5"
},
"scripts": {
"test": "./vendor/bin/phpunit"
}
}

BIN
logo.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 15 KiB

19
phpunit.xml Normal file
View File

@@ -0,0 +1,19 @@
<phpunit
bootstrap="vendor/autoload.php"
convertErrorsToExceptions="true"
convertNoticesToExceptions="true"
convertWarningsToExceptions="true"
>
<testsuites>
<testsuite name="querylist">
<directory>./tests</directory>
</testsuite>
</testsuites>
<filter>
<whitelist>
<directory suffix=".php">src</directory>
</whitelist>
</filter>
</phpunit>

View File

@@ -7,6 +7,7 @@
namespace QL;
use Closure;
use Tightenco\Collect\Support\Collection;
class Config
{
@@ -20,17 +21,29 @@ class Config
*/
public function __construct()
{
$this->plugins = collect();
$this->binds = collect();
$this->plugins = new Collection();
$this->binds = new Collection();
}
/**
* Get the Config instance
*
* @return null|Config
*/
public static function getInstance()
{
self::$instance || self::$instance = new self();
return self::$instance;
}
/**
* Global installation plugin
*
* @param $plugins
* @param array ...$opt
* @return $this
*/
public function use($plugins,...$opt)
{
if(is_string($plugins)){
@@ -41,6 +54,13 @@ class Config
return $this;
}
/**
* Global binding custom method
*
* @param string $name
* @param Closure $provider
* @return $this
*/
public function bind(string $name, Closure $provider)
{
$this->binds[$name] = $provider;

View File

@@ -7,70 +7,72 @@
namespace QL\Dom;
use phpDocumentor\Reflection\Types\Null_;
use phpQueryObject;
use Tightenco\Collect\Support\Collection;
/**
* Class Elements
* @package QL\Dom
*
* @method Elements toReference($var)
* @method documentFragment($state)
* @method Elements toReference(&$var)
* @method Elements documentFragment($state = null)
* @method Elements toRoot()
* @method Elements getDocumentIDRef($documentID)
* @method Elements getDocumentIDRef(&$documentID)
* @method Elements getDocument()
* @method getDOMDocument()
* @method \DOMDocument getDOMDocument()
* @method Elements getDocumentID()
* @method Elements unloadDocument()
* @method isHTML()
* @method isXHTML()
* @method isXML()
* @method serialize()
* @method serializeArray($submit)
* @method get($index,$callback1,$callback2,$callback3)
* @method getString($index,$callback1,$callback2,$callback3)
* @method getStrings($index,$callback1,$callback2,$callback3)
* @method newInstance($newStack)
* @method Elements find($selectors,$context,$noHistory)
* @method Elements is($selector,$nodes)
* @method Elements filterCallback($callback,$_skipHistory)
* @method Elements filter($selectors,$_skipHistory)
* @method load($url,$data,$callback)
* @method Elements trigger($type,$data)
* @method Elements triggerHandler($type,$data)
* @method Elements bind($type,$data,$callback)
* @method unbind($type,$callback)
* @method Elements change($callback)
* @method Elements submit($callback)
* @method Elements click($callback)
* @method bool isHTML()
* @method bool isXHTML()
* @method bool isXML()
* @method string serialize()
* @method array serializeArray($submit = null)
* @method \DOMElement|\DOMElement[] get($index = null, $callback1 = null, $callback2 = null, $callback3 = null)
* @method string|array getString($index = null, $callback1 = null, $callback2 = null, $callback3 = null)
* @method string|array getStrings($index = null, $callback1 = null, $callback2 = null, $callback3 = null)
* @method Elements newInstance($newStack = null)
* @method Elements find($selectors, $context = null, $noHistory = false)
* @method Elements|bool is($selector, $nodes = null)
* @method Elements filterCallback($callback, $_skipHistory = false)
* @method Elements filter($selectors, $_skipHistory = false)
* @method Elements load($url, $data = null, $callback = null)
* @method Elements trigger($type, $data = [])
* @method Elements triggerHandler($type, $data = [])
* @method Elements bind($type, $data, $callback = null)
* @method Elements unbind($type = null, $callback = null)
* @method Elements change($callback = null)
* @method Elements submit($callback = null)
* @method Elements click($callback = null)
* @method Elements wrapAllOld($wrapper)
* @method Elements wrapAll($wrapper)
* @method Elements wrapAllPHP($codeBefore,$codeAfter)
* @method Elements wrapAllPHP($codeBefore, $codeAfter)
* @method Elements wrap($wrapper)
* @method Elements wrapPHP($codeBefore,$codeAfter)
* @method Elements wrapPHP($codeBefore, $codeAfter)
* @method Elements wrapInner($wrapper)
* @method Elements wrapInnerPHP($codeBefore,$codeAfter)
* @method Elements wrapInnerPHP($codeBefore, $codeAfter)
* @method Elements contents()
* @method Elements contentsUnwrap()
* @method switchWith($markup)
* @method Elements switchWith($markup)
* @method Elements eq($num)
* @method Elements size()
* @method Elements length()
* @method count()
* @method Elements end($level)
* @method int count()
* @method Elements end($level = 1)
* @method Elements _clone()
* @method Elements replaceWithPHP($code)
* @method Elements replaceWith($content)
* @method Elements replaceAll($selector)
* @method Elements remove($selector)
* @method markup($markup,$callback1,$callback2,$callback3)
* @method markupOuter($callback1,$callback2,$callback3)
* @method html($html,$callback1,$callback2,$callback3)
* @method xml($xml,$callback1,$callback2,$callback3)
* @method htmlOuter($callback1,$callback2,$callback3)
* @method xmlOuter($callback1,$callback2,$callback3)
* @method Elements remove($selector = null)
* @method Elements|string markup($markup = null, $callback1 = null, $callback2 = null, $callback3 = null)
* @method string markupOuter($callback1 = null, $callback2 = null, $callback3 = null)
* @method Elements|string html($html = null, $callback1 = null, $callback2 = null, $callback3 = null)
* @method Elements|string xml($xml = null, $callback1 = null, $callback2 = null, $callback3 = null)
* @method string htmlOuter($callback1 = null, $callback2 = null, $callback3 = null)
* @method string xmlOuter($callback1 = null, $callback2 = null, $callback3 = null)
* @method Elements php($code)
* @method markupPHP($code)
* @method markupOuterPHP()
* @method string markupPHP($code)
* @method string markupOuterPHP()
* @method Elements children($selector)
* @method Elements ancestors($selector)
* @method Elements append($content)
@@ -85,56 +87,52 @@ use phpQueryObject;
* @method Elements after($content)
* @method Elements afterPHP($content)
* @method Elements insertAfter($selector)
* @method Elements insert($target,$type)
* @method index($subject)
* @method Elements slice($start,$end)
* @method Elements insert($target, $type)
* @method int index($subject)
* @method Elements slice($start, $end = null)
* @method Elements reverse()
* @method text($text,$callback1,$callback2,$callback3)
* @method Elements plugin($class,$file)
* @method extend($class,$file)
* @method Elements _next($selector)
* @method Elements _prev($selector)
* @method Elements prev($selector)
* @method Elements prevAll($selector)
* @method Elements nextAll($selector)
* @method Elements siblings($selector)
* @method Elements not($selector)
* @method Elements add($selector)
* @method Elements parent($selector)
* @method Elements parents($selector)
* @method stack($nodeTypes)
* @method attr($attr,$value)
* @method Elements attrPHP($attr,$code)
* @method Elements|string text($text = null, $callback1 = null, $callback2 = null, $callback3 = null)
* @method Elements plugin($class, $file = null)
* @method Elements _next($selector = null)
* @method Elements _prev($selector = null)
* @method Elements prev($selector = null)
* @method Elements prevAll($selector = null)
* @method Elements nextAll($selector = null)
* @method Elements siblings($selector = null)
* @method Elements not($selector = null)
* @method Elements add($selector = null)
* @method Elements parent($selector = null)
* @method Elements parents($selector = null)
* @method Elements stack($nodeTypes = null)
* @method Elements|string attr($attr = null, $value = null)
* @method Elements attrPHP($attr, $code)
* @method Elements removeAttr($attr)
* @method val($val)
* @method Elements|string val($val = null)
* @method Elements andSelf()
* @method Elements addClass($className)
* @method Elements addClassPHP($className)
* @method hasClass($className)
* @method bool hasClass($className)
* @method Elements removeClass($className)
* @method Elements toggleClass($className)
* @method Elements _empty()
* @method Elements each($callback,$param1,$param2,$param3)
* @method Elements callback($callback,$param1,$param2,$param3)
* @method data($key,$value)
* @method removeData($key)
* @method rewind()
* @method current()
* @method key()
* @method Elements next($cssSelector)
* @method valid()
* @method offsetExists($offset)
* @method offsetGet($offset)
* @method offsetSet($offset,$value)
* @method offsetUnset($offset)
* @method whois($oneNode)
* @method Elements callback($callback, $param1 = null, $param2 = null, $param3 = null)
* @method string data($key, $value = null)
* @method Elements removeData($key)
* @method void rewind()
* @method Elements current()
* @method int key()
* @method Elements next($cssSelector = null)
* @method bool valid()
* @method bool offsetExists($offset)
* @method Elements offsetGet($offset)
* @method void offsetSet($offset, $value)
* @method string whois($oneNode)
* @method Elements dump()
* @method dumpWhois()
* @method dumpLength()
* @method dumpTree($html,$title)
* @method Elements dumpWhois()
* @method Elements dumpLength()
* @method Elements dumpTree($html, $title)
* @method dumpDie()
*/
class Elements
{
/**
@@ -153,50 +151,104 @@ class Elements
public function __get($name)
{
return property_exists($this->elements,$name)?$this->elements->$name:$this->elements->attr($name);
return property_exists($this->elements, $name) ? $this->elements->$name : $this->elements->attr($name);
}
public function __call($name, $arguments)
{
$obj = call_user_func_array([$this->elements,$name],$arguments);
if($obj instanceof phpQueryObject){
$obj = call_user_func_array([$this->elements, $name], $arguments);
if ($obj instanceof phpQueryObject) {
$obj = new self($obj);
}else if(is_string($obj)){
} else if (is_string($obj)) {
$obj = trim($obj);
}
return $obj;
}
/**
 * Invoke a callback once per matched element.
 *
 * The callback receives the element wrapped in a new Elements instance as its first
 * argument and the iteration key as its second. Returning exactly false from the
 * callback stops the iteration early.
 *
 * @param callable $callback
 *
 * @return $this
 */
public function each(callable $callback)
{
    foreach ($this->elements as $index => $node) {
        // Only a strict false aborts; null/0/'' keep iterating.
        if ($callback(new self(pq($node)), $index) === false) {
            return $this;
        }
    }

    return $this;
}
/**
* Iterating elements
*
* @param $callback
* @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection
*/
public function map($callback)
{
$collection = collect();
$this->elements->each(function($dom) use(& $collection,$callback){
$collection = new Collection();
$this->elements->each(function ($dom) use (& $collection, $callback) {
$collection->push($callback(new self(pq($dom))));
});
return $collection;
}
/**
* Gets the attributes of all the elements
*
* @param string $attr HTML attribute name
* @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection
*/
public function attrs($attr)
{
return $this->map(function($item) use($attr){
return $this->map(function ($item) use ($attr) {
return $item->attr($attr);
});
}
/**
* Gets the text of all the elements
*
* @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection
*/
public function texts()
{
return $this->map(function($item){
return $this->map(function ($item) {
return trim($item->text());
});
}
/**
* Gets the html of all the elements
*
* @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection
*/
public function htmls()
{
return $this->map(function($item){
return $this->map(function ($item) {
return trim($item->html());
});
}
/**
 * Collect the trimmed outer HTML of every matched element.
 *
 * @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection
 */
public function htmlOuters()
{
    $toOuterHtml = function ($element) {
        return trim($element->htmlOuter());
    };

    return $this->map($toOuterHtml);
}
/**
* @return phpQueryObject
*/

View File

@@ -7,14 +7,18 @@
namespace QL\Dom;
use Illuminate\Support\Collection;
use Tightenco\Collect\Support\Collection;
use phpQuery;
use phpQueryObject;
use QL\QueryList;
use Closure;
class Query
{
protected $html;
/**
* @var \phpQueryObject
*/
protected $document;
protected $rules;
protected $range = null;
@@ -30,35 +34,71 @@ class Query
$this->ql = $ql;
}
public function getHtml()
/**
* @param bool $rel
* @return String
*/
public function getHtml($rel = true)
{
return $this->html;
return $rel ? $this->document->htmlOuter() : $this->html;
}
/**
* @param $html
* @param null $charset
* @return QueryList
*/
public function setHtml($html, $charset = null)
{
$this->html = value($html);
$this->document = phpQuery::newDocumentHTML($this->html,$charset);
$this->destroyDocument();
$this->document = phpQuery::newDocumentHTML($this->html, $charset);
return $this->ql;
}
/**
* Get crawl results
*
* @param Closure|null $callback
* @return Collection|static
*/
public function getData(Closure $callback = null)
{
return is_null($callback) ? $this->data : $this->data->map($callback);
return $this->handleData($this->data, $callback);
}
/**
 * Replace the stored crawl results.
 *
 * Returns the owning QueryList, like the other setters of this class
 * (setHtml(), rules(), range()), so the call can be chained.
 *
 * @param Collection $data
 * @return QueryList
 */
public function setData(Collection $data)
{
    $this->data = $data;

    return $this->ql;
}
/**
 * Find every element of the current document that matches a selector.
 *
 * @param string $selector Selector expression to match elements against.
 * @return Elements
 */
public function find($selector)
{
    $dom = new Dom($this->document);

    return $dom->find($selector);
}
/**
* Set crawl rule
*
* $rules = [
* 'rule_name1' => ['selector','HTML attribute | text | html','Tag filter list','callback'],
* 'rule_name2' => ['selector','HTML attribute | text | html','Tag filter list','callback'],
* // ...
* ]
*
* @param array $rules
* @return QueryList
*/
public function rules(array $rules)
{
$this->rules = $rules;
@@ -66,129 +106,180 @@ class Query
}
public function range($range)
/**
* Set the slice area for crawl list
*
* @param $selector
* @return QueryList
*/
public function range($selector)
{
$this->range = $range;
$this->range = $selector;
return $this->ql;
}
/**
* Remove the HTML head to try to fix garbled (mis-encoded) text
*
* @return QueryList
*/
public function removeHead()
{
$html = preg_replace('/<head.+?>.+<\/head>/is','<head></head>',$this->html);
$this->setHtml($html);
$html = preg_replace('/(<head>|<head\s+.+?>).+?<\/head>/is', '<head></head>', $this->html);
$html && $this->setHtml($html);
return $this->ql;
}
/**
* Execute the query rule
*
* @param Closure|null $callback
* @return QueryList
*/
public function query(Closure $callback = null)
{
$this->data = $this->getList();
$callback && $this->data = $this->data->map($callback);
$this->data = $this->handleData($this->data, $callback);
return $this->ql;
}
/**
 * Run an optional user callback over crawl results.
 *
 * - Non-callable $callback: $data is returned untouched.
 * - No range selector set: the callback is invoked once with the whole result
 *   array (and null as second argument); its return value is wrapped in a new
 *   Collection.
 * - Range selector set: the callback is mapped over the collection.
 *
 * @param Collection $data
 * @param callable|null $callback
 * @return Collection
 */
public function handleData(Collection $data, $callback)
{
    if (!is_callable($callback)) {
        return $data;
    }

    if (empty($this->range)) {
        return new Collection($callback($data->all(), null));
    }

    return $data->map($callback);
}
protected function getList()
{
$data = [];
if (!empty($this->range)) {
$robj = $this->document->find($this->range);
if (empty($this->range)) {
foreach ($this->rules as $key => $reg_value) {
$rule = $this->parseRule($reg_value);
$contentElements = $this->document->find($rule['selector']);
$data[$key] = $this->extractContent($contentElements, $key, $rule);
}
} else {
$rangeElements = $this->document->find($this->range);
$i = 0;
foreach ($robj as $item) {
foreach ($this->rules as $key => $reg_value){
$tags = $reg_value[2] ?? '';
$iobj = pq($item,$this->document)->find($reg_value[0]);
switch ($reg_value[1]) {
case 'text':
$data[$i][$key] = $this->allowTags(pq($iobj)->html(),$tags);
break;
case 'html':
$data[$i][$key] = $this->stripTags(pq($iobj)->html(),$tags);
break;
default:
$data[$i][$key] = pq($iobj)->attr($reg_value[1]);
break;
}
if(isset($reg_value[3])){
$data[$i][$key] = call_user_func($reg_value[3],$data[$i][$key],$key);
}
foreach ($rangeElements as $element) {
foreach ($this->rules as $key => $reg_value) {
$rule = $this->parseRule($reg_value);
$contentElements = pq($element)->find($rule['selector']);
$data[$i][$key] = $this->extractContent($contentElements, $key, $rule);
}
$i++;
}
} else {
foreach ($this->rules as $key => $reg_value){
$tags = $reg_value[2] ?? '';
$lobj = $this->document->find($reg_value[0]);
$i = 0;
foreach ($lobj as $item) {
switch ($reg_value[1]) {
case 'text':
$data[$i][$key] = $this->allowTags(pq($item,$this->document)->html(),$tags);
break;
case 'html':
$data[$i][$key] = $this->stripTags(pq($item,$this->document)->html(),$tags);
break;
default:
$data[$i][$key] = pq($item,$this->document)->attr($reg_value[1]);
break;
}
if(isset($reg_value[3])){
$data[$i][$key] = call_user_func($reg_value[3],$data[$i][$key],$key);
}
$i++;
}
}
}
// phpQuery::$documents = array();
return collect($data);
return new Collection($data);
}
/**
 * Extract the value described by a parsed rule from a set of matched elements.
 *
 * Supported $rule['attr'] modes:
 *  - text / texts:           inner HTML passed through allowTags()
 *  - html / htmls:           inner HTML passed through stripTags()
 *  - htmlOuter / htmlOuters: outer HTML passed through stripTags()
 *  - attr(name) / attrs(name), or a bare attribute name: attribute value(s)
 *
 * The plural modes yield a plain array with one entry per matched element; the
 * singular modes call the accessor once on the whole matched set.
 * If $rule['handle_callback'] is callable it receives (content, rule name) and its
 * return value replaces the content.
 *
 * @param phpQueryObject $pqObj Elements matched by the rule's selector
 * @param string $ruleName Key of the rule, forwarded to the handle callback
 * @param array $rule Rule in the shape produced by parseRule()
 * @return mixed
 */
protected function extractContent(phpQueryObject $pqObj, $ruleName, $rule)
{
    // Runs $perElement on every matched element and returns the results as an array.
    $collect = function (Closure $perElement) use ($pqObj) {
        return (new Elements($pqObj))->map($perElement)->all();
    };

    switch ($rule['attr']) {
        case 'text':
            $content = $this->allowTags($pqObj->html(), $rule['filter_tags']);
            break;

        case 'texts':
            $content = $collect(function (Elements $element) use ($rule) {
                return $this->allowTags($element->html(), $rule['filter_tags']);
            });
            break;

        case 'html':
            $content = $this->stripTags($pqObj->html(), $rule['filter_tags']);
            break;

        case 'htmls':
            $content = $collect(function (Elements $element) use ($rule) {
                return $this->stripTags($element->html(), $rule['filter_tags']);
            });
            break;

        case 'htmlOuter':
            $content = $this->stripTags($pqObj->htmlOuter(), $rule['filter_tags']);
            break;

        case 'htmlOuters':
            $content = $collect(function (Elements $element) use ($rule) {
                return $this->stripTags($element->htmlOuter(), $rule['filter_tags']);
            });
            break;

        default:
            // "attr(x)" and "attrs(x)" are explicit forms; anything else is used
            // verbatim as the attribute name.
            if (preg_match('/attr\((.+)\)/', $rule['attr'], $matches)) {
                $content = $pqObj->attr($matches[1]);
            } elseif (preg_match('/attrs\((.+)\)/', $rule['attr'], $matches)) {
                $content = (new Elements($pqObj))->attrs($matches[1])->all();
            } else {
                $content = $pqObj->attr($rule['attr']);
            }
            break;
    }

    $handler = $rule['handle_callback'];

    return is_callable($handler)
        ? call_user_func($handler, $content, $ruleName)
        : $content;
}
/**
 * Turn a positional rule definition into a named structure.
 *
 * Index 0 is the selector and index 1 the attribute/mode; index 2 (tag filter
 * list) defaults to '' and index 3 (handle callback) defaults to null.
 *
 * @param array $rule
 * @return array
 */
protected function parseRule($rule)
{
    return [
        'selector' => $rule[0],
        'attr' => $rule[1],
        'filter_tags' => $rule[2] ?? '',
        'handle_callback' => $rule[3] ?? null,
    ];
}
/**
* 去除特定的html标签
* @param string $html
* @param string $tags_str 多个标签名之间用空格隔开
* @param string $html
* @param string $tags_str 多个标签名之间用空格隔开
* @return string
*/
protected function stripTags($html,$tags_str)
protected function stripTags($html, $tags_str)
{
$tagsArr = $this->tag($tags_str);
$html = $this->removeTags($html,$tagsArr[1]);
$html = $this->removeTags($html, $tagsArr[1]);
$p = array();
foreach ($tagsArr[0] as $tag) {
$p[]="/(<(?:\/".$tag."|".$tag.")[^>]*>)/i";
$p[] = "/(<(?:\/" . $tag . "|" . $tag . ")[^>]*>)/i";
}
$html = preg_replace($p,"",trim($html));
$html = preg_replace($p, "", trim($html));
return $html;
}
/**
* 保留特定的html标签
* @param string $html
* @param string $tags_str 多个标签名之间用空格隔开
* @param string $html
* @param string $tags_str 多个标签名之间用空格隔开
* @return string
*/
protected function allowTags($html,$tags_str)
protected function allowTags($html, $tags_str)
{
$tagsArr = $this->tag($tags_str);
$html = $this->removeTags($html,$tagsArr[1]);
$html = $this->removeTags($html, $tagsArr[1]);
$allow = '';
foreach ($tagsArr[0] as $tag) {
$allow .= "<$tag> ";
}
return strip_tags(trim($html),$allow);
return strip_tags(trim($html), $allow);
}
protected function tag($tags_str)
{
$tagArr = preg_split("/\s+/",$tags_str,-1,PREG_SPLIT_NO_EMPTY);
$tags = array(array(),array());
foreach($tagArr as $tag)
{
if(preg_match('/-(.+)/', $tag,$arr))
{
$tagArr = preg_split("/\s+/", $tags_str, -1, PREG_SPLIT_NO_EMPTY);
$tags = array(array(), array());
foreach ($tagArr as $tag) {
if (preg_match('/-(.+)/', $tag, $arr)) {
array_push($tags[1], $arr[1]);
}else{
} else {
array_push($tags[0], $tag);
}
}
@@ -197,17 +288,16 @@ class Query
/**
* 移除特定的html标签
* @param string $html
* @param array $tags 标签数组
* @param string $html
* @param array $tags 标签数组
* @return string
*/
protected function removeTags($html,$tags)
protected function removeTags($html, $tags)
{
$tag_str = '';
if(count($tags))
{
if (count($tags)) {
foreach ($tags as $tag) {
$tag_str .= $tag_str?','.$tag:$tag;
$tag_str .= $tag_str ? ',' . $tag : $tag;
}
// phpQuery::$defaultCharset = $this->inputEncoding?$this->inputEncoding:$this->htmlEncoding;
$doc = phpQuery::newDocumentHTML($html);
@@ -217,4 +307,16 @@ class Query
}
return $html;
}
}
/**
 * Unload the current phpQuery document, if one has been created.
 */
protected function destroyDocument()
{
    if (!($this->document instanceof phpQueryObject)) {
        return;
    }

    $this->document->unloadDocument();
}
/**
* Unload the held phpQuery document (via destroyDocument()) when this object is destroyed.
*/
public function __destruct()
{
$this->destroyDocument();
}
}

View File

@@ -14,6 +14,7 @@ use Closure;
use QL\Providers\HttpServiceProvider;
use QL\Providers\PluginServiceProvider;
use QL\Providers\SystemServiceProvider;
use Tightenco\Collect\Support\Collection;
class Kernel
{
@@ -34,7 +35,7 @@ class Kernel
public function __construct(QueryList $ql)
{
$this->ql = $ql;
$this->binds = collect();
$this->binds = new Collection();
}
public function bootstrap()

View File

@@ -11,6 +11,7 @@ namespace QL\Providers;
use QL\Contracts\ServiceProviderContract;
use QL\Kernel;
use QL\Services\HttpService;
use QL\Services\MultiRequestService;
class HttpServiceProvider implements ServiceProviderContract
{
@@ -23,5 +24,17 @@ class HttpServiceProvider implements ServiceProviderContract
$kernel->bind('post',function (...$args){
return HttpService::post($this,...$args);
});
$kernel->bind('postJson',function (...$args){
return HttpService::postJson($this,...$args);
});
$kernel->bind('multiGet',function (...$args){
return new MultiRequestService($this,'get',...$args);
});
$kernel->bind('multiPost',function (...$args){
return new MultiRequestService($this,'post',...$args);
});
}
}

View File

@@ -9,6 +9,7 @@ namespace QL\Providers;
use QL\Contracts\ServiceProviderContract;
use QL\Kernel;
use Closure;
class SystemServiceProvider implements ServiceProviderContract
{
@@ -19,5 +20,13 @@ class SystemServiceProvider implements ServiceProviderContract
return $this;
});
$kernel->bind('queryData',function (Closure $callback = null){
return $this->query()->getData($callback)->all();
});
$kernel->bind('pipe',function (Closure $callback = null){
return $callback($this);
});
}
}

View File

@@ -14,14 +14,16 @@
namespace QL;
use phpQuery;
use QL\Dom\Query;
use Illuminate\Support\Collection;
use Tightenco\Collect\Support\Collection;
use Closure;
use QL\Services\MultiRequestService;
/**
* Class QueryList
* @package QL
*
* @method string getHtml()
* @method string getHtml($rel = true)
* @method QueryList setHtml($html)
* @method QueryList html($html)
* @method Dom\Elements find($selector)
@@ -30,17 +32,22 @@ use Closure;
* @method QueryList removeHead()
* @method QueryList query(Closure $callback = null)
* @method Collection getData(Closure $callback = null)
* @method Array queryData(Closure $callback = null)
* @method QueryList setData(Collection $data)
* @method QueryList encoding(string $outputEncoding,string $inputEncoding = null)
* @method QueryList get($url,$args = null,$otherArgs = [])
* @method QueryList post($url,$args = null,$otherArgs = [])
* @method QueryList postJson($url,$args = null,$otherArgs = [])
* @method MultiRequestService multiGet($urls)
* @method MultiRequestService multiPost($urls)
* @method QueryList use($plugins,...$opt)
* @method QueryList pipe(Closure $callback = null)
*/
class QueryList
{
protected $query;
protected $kernel;
protected static $plugins = [];
protected static $instance = null;
/**
* QueryList constructor.
@@ -64,7 +71,7 @@ class QueryList
public static function __callStatic($name, $arguments)
{
$instance = self::getInstance();
$instance = new self();
return $instance->$name(...$arguments);
}
@@ -73,22 +80,50 @@ class QueryList
$this->destruct();
}
/**
* Get the QueryList single instance
*
* @return QueryList
*/
public static function getInstance()
{
$instance = new self();
return $instance;
self::$instance || self::$instance = new self();
return self::$instance;
}
/**
* Get the Config instance.
*
* Thin static shortcut that delegates to Config::getInstance().
*
* @return null|Config
*/
public static function config()
{
return Config::getInstance();
}
/**
 * Release the resources held by this instance by unsetting its query and kernel
 * properties.
 */
public function destruct()
{
    unset($this->query, $this->kernel);
}
/**
* Destroy all documents.
*
* Resets phpQuery's static $documents registry to an empty array, so every
* document registered there is dropped, not just the one used by a particular
* QueryList instance.
*/
public static function destructDocuments()
{
phpQuery::$documents = [];
}
/**
* Bind a custom method to the QueryList object
*
* @param string $name Invoking the name
* @param Closure $provide Called method
* @return $this
*/
public function bind(string $name,Closure $provide)
{
$this->kernel->bind($name,$provide);

View File

@@ -27,7 +27,8 @@ class HttpService
public static function get(QueryList $ql,$url,$args = null,$otherArgs = [])
{
$otherArgs = array_merge([
'cookies' => self::getCookieJar()
'cookies' => self::getCookieJar(),
'verify' => false
],$otherArgs);
$html = GHttp::get($url,$args,$otherArgs);
$ql->setHtml($html);
@@ -37,10 +38,22 @@ class HttpService
public static function post(QueryList $ql,$url,$args = null,$otherArgs = [])
{
$otherArgs = array_merge([
'cookies' => self::getCookieJar()
'cookies' => self::getCookieJar(),
'verify' => false
],$otherArgs);
$html = GHttp::post($url,$args,$otherArgs);
$ql->setHtml($html);
return $ql;
}
/**
 * Send a POST request through GHttp::postJson() and load the response into $ql.
 *
 * Caller-supplied $otherArgs override the defaults below.
 * NOTE(review): 'verify' => false presumably disables TLS certificate verification
 * in the underlying HTTP client unless the caller overrides it; confirm this is
 * intended.
 *
 * @param QueryList $ql
 * @param string $url
 * @param mixed $args
 * @param array $otherArgs
 * @return QueryList
 */
public static function postJson(QueryList $ql,$url,$args = null,$otherArgs = [])
{
    $defaults = [
        'cookies' => self::getCookieJar(),
        'verify' => false
    ];
    $options = array_merge($defaults, $otherArgs);

    $ql->setHtml(GHttp::postJson($url, $args, $options));

    return $ql;
}
}

View File

@@ -0,0 +1,66 @@
<?php
/**
* Created by PhpStorm.
* User: Jaeger <JaegerCode@gmail.com>
* Date: 18/12/10
* Time: 下午7:05
*/
namespace QL\Services;
use Jaeger\GHttp;
use Closure;
use GuzzleHttp\Psr7\Response;
use QL\QueryList;
use GuzzleHttp\Exception\RequestException;
/**
 * Class MultiRequestService
 * @package QL\Services
 *
 * Fluent wrapper around the multi-request object returned by GHttp::multiRequest().
 * Methods not defined here are forwarded to that object via __call().
 *
 * @method MultiRequestService withHeaders($headers)
 * @method MultiRequestService withOptions($options)
 * @method MultiRequestService concurrency($concurrency)
 */
class MultiRequestService
{
    /** @var QueryList Instance that receives each response body and is handed to the callbacks */
    protected $ql;

    /** @var mixed Multi-request object created by GHttp::multiRequest() */
    protected $multiRequest;

    /** @var string Name of the method invoked on the multi-request object by send() */
    protected $method;

    /**
     * @param QueryList $ql
     * @param string $method Method to call on the multi-request object when sending (e.g. 'get', 'post')
     * @param mixed $urls Passed straight to GHttp::multiRequest()
     */
    public function __construct(QueryList $ql,$method,$urls)
    {
        $this->ql = $ql;
        $this->method = $method;
        $this->multiRequest = GHttp::multiRequest($urls);
    }

    /**
     * Forward any other call to the multi-request object and keep the chain on this wrapper.
     *
     * @param string $name
     * @param array $arguments
     * @return $this
     */
    public function __call($name, $arguments)
    {
        $this->multiRequest = $this->multiRequest->$name(...$arguments);
        return $this;
    }

    /**
     * Register the handler for successful responses.
     *
     * Before $success is called, the response body is loaded into the shared
     * QueryList instance. $success receives (QueryList, Response, index).
     *
     * @param Closure $success
     * @return $this
     */
    public function success(Closure $success)
    {
        $this->multiRequest = $this->multiRequest->success(function(Response $response, $index) use($success){
            $this->ql->setHtml((string)$response->getBody());
            $success($this->ql,$response, $index);
        });
        return $this;
    }

    /**
     * Register the handler for failed requests.
     *
     * $error receives (QueryList, reason, index). The reason is deliberately not
     * type-hinted: a failure is not guaranteed to be a RequestException (in
     * Guzzle 7, ConnectException no longer extends it), and a strict hint would
     * raise a TypeError instead of reaching the user's handler.
     *
     * @param Closure $error
     * @return $this
     */
    public function error(Closure $error)
    {
        $this->multiRequest = $this->multiRequest->error(function($reason, $index) use($error){
            $error($this->ql,$reason, $index);
        });
        return $this;
    }

    /**
     * Dispatch the requests by calling the configured method on the multi-request object.
     */
    public function send()
    {
        $this->multiRequest->{$this->method}();
    }
}

71
tests/Dom/FindTest.php Normal file
View File

@@ -0,0 +1,71 @@
<?php
/**
* Created by PhpStorm.
* User: x
* Date: 2018/12/10
* Time: 12:46 AM
*/
namespace Tests\Dom;
use QL\QueryList;
use Tests\TestCaseBase;
/**
 * Tests for attribute access on elements returned by QueryList::find(),
 * using the snippet-1 fixture.
 */
class FindTest extends TestCaseBase
{
    protected $html;
    protected $ql;

    protected function setUp(): void
    {
        $this->html = $this->getSnippet('snippet-1');
        $this->ql = QueryList::html($this->html);
    }

    /**
     * Every way of addressing the first image must yield the same src value.
     *
     * @test
     */
    public function find_first_dom_attr()
    {
        $img = [];
        $img[] = $this->ql->find('img')->attr('src');
        $img[] = $this->ql->find('img')->src;
        $img[] = $this->ql->find('img:eq(0)')->src;
        $img[] = $this->ql->find('img')->eq(0)->src;
        $alt = $this->ql->find('img')->alt;
        $abc = $this->ql->find('img')->abc;

        $this->assertCount(1, array_unique($img));
        // assertEquals takes (expected, actual); keeping that order makes failure
        // messages report the right side as "expected".
        $this->assertEquals('这是图片', $alt);
        $this->assertEquals('这是一个自定义属性', $abc);
    }

    /**
     * Every way of addressing the second image must yield the same alt value.
     *
     * @test
     */
    public function find_second_dom_attr()
    {
        $img2 = [];
        $img2[] = $this->ql->find('img')->eq(1)->alt;
        $img2[] = $this->ql->find('img:eq(1)')->alt;
        $img2[] = $this->ql->find('.second_pic')->alt;

        $this->assertCount(1, array_unique($img2));
    }

    /**
     * attr('*') returns all attributes of the element.
     *
     * @test
     */
    public function find_dom_all_attr()
    {
        $imgAttr = $this->ql->find('img:eq(0)')->attr('*');
        $linkAttr = $this->ql->find('a:eq(1)')->attr('*');

        $this->assertCount(3, $imgAttr);
        $this->assertCount(1, $linkAttr);
    }
}

43
tests/Dom/RulesTest.php Normal file
View File

@@ -0,0 +1,43 @@
<?php
/**
* Created by PhpStorm.
* User: Jaeger <JaegerCode@gmail.com>
* Date: 18/12/12
* Time: 下午12:25
*/
namespace Tests\Dom;
use QL\QueryList;
use Tests\TestCaseBase;
use Tightenco\Collect\Support\Collection;
/**
 * Tests rule-based extraction (rules + range) against the snippet-2 fixture.
 */
class RulesTest extends TestCaseBase
{
    protected $html;
    protected $ql;

    protected function setUp(): void
    {
        $this->html = $this->getSnippet('snippet-2');
        $this->ql = QueryList::html($this->html);
    }

    /**
     * @test
     */
    public function get_data_by_rules()
    {
        $data = QueryList::rules([
            'a' => ['a','text'],
            'img_src' => ['img','src'],
            'img_alt' => ['img','alt']
        ])
            ->range('ul>li')
            ->html($this->html)
            ->query()
            ->getData();

        $this->assertInstanceOf(Collection::class, $data);
        $this->assertCount(3, $data);
        $this->assertEquals('http://querylist.com/2.jpg', $data[1]['img_src']);
    }
}

103
tests/Feature/HttpTest.php Normal file
View File

@@ -0,0 +1,103 @@
<?php
/**
* Created by PhpStorm.
* User: x
* Date: 2018/12/10
* Time: 12:35 AM
*/
namespace Tests\Feature;
use GuzzleHttp\Handler\MockHandler;
use GuzzleHttp\Psr7\Response;
use QL\QueryList;
use Tests\TestCaseBase;
/**
 * Feature tests for the HTTP helpers (postJson, multiGet, cached get).
 *
 * Apart from can_post_json_data(), which uses a Guzzle MockHandler, these tests
 * request the httpbin.org URLs listed in setUp().
 */
class HttpTest extends TestCaseBase
{
    protected $urls;

    protected function setUp(): void
    {
        $this->urls = [
            'http://httpbin.org/get?name=php',
            'http://httpbin.org/get?name=golang',
            'http://httpbin.org/get?name=c++',
            'http://httpbin.org/get?name=java'
        ];
    }

    /**
     * @test
     */
    public function can_post_json_data()
    {
        $mock = new MockHandler([new Response()]);
        $data = [
            'name' => 'foo'
        ];
        QueryList::postJson('http://foo.com',$data,[
            'handler' => $mock
        ]);

        // (expected, actual): the JSON-encoded payload is the expectation.
        $this->assertEquals(json_encode($data), (string)$mock->getLastRequest()->getBody());
    }

    /**
     * @test
     */
    public function concurrent_requests_base_use()
    {
        $urls = $this->urls;
        QueryList::getInstance()
            ->multiGet($urls)
            ->success(function(QueryList $ql,Response $response, $index) use($urls){
                $body = json_decode((string)$response->getBody(),true);
                $this->assertEquals($urls[$index],$body['url']);
            })->send();
    }

    /**
     * @test
     */
    public function concurrent_requests_advanced_use()
    {
        $ua = 'QueryList/4.0';
        $errorUrl = 'http://web-site-not-exist.com';
        $urls = array_merge($this->urls,[$errorUrl]);

        QueryList::rules([])
            ->multiGet($urls)
            ->concurrency(2)
            ->withOptions([
                'timeout' => 60
            ])
            ->withHeaders([
                'User-Agent' => $ua
            ])
            ->success(function (QueryList $ql, Response $response, $index) use($ua){
                $body = json_decode((string)$response->getBody(),true);
                $this->assertEquals($ua,$body['headers']['User-Agent']);
            })
            ->error(function (QueryList $ql, $reason, $index) use($urls,$errorUrl){
                // Only the deliberately invalid URL is expected to fail.
                $this->assertEquals($errorUrl, $urls[$index]);
            })
            ->send();
    }

    /**
     * @test
     */
    public function request_with_cache()
    {
        $url = $this->urls[0];
        $data = QueryList::get($url,null,[
            'cache' => sys_get_temp_dir(),
            'cache_ttl' => 600
        ])->getHtml();
        $data = json_decode($data,true);

        $this->assertEquals($url,$data['url']);
    }
}

View File

@@ -0,0 +1,48 @@
<?php
/**
* Created by PhpStorm.
* User: x
* Date: 2018/12/9
* Time: 11:10 PM
*/
namespace Tests\Feature;
use QL\QueryList;
use Tests\TestCaseBase;
/**
 * Tests the difference between the shared singleton and freshly created instances.
 */
class InstanceTest extends TestCaseBase
{
    protected $html;

    protected function setUp(): void
    {
        $this->html = $this->getSnippet('snippet-1');
    }

    /**
     * Two getInstance() calls must expose the same HTML.
     *
     * @test
     */
    public function singleton_instance_mode()
    {
        $first = QueryList::getInstance()->html($this->html);
        $second = QueryList::getInstance();

        $this->assertEquals($first->getHtml(), $second->getHtml());
    }

    /**
     * Instances from `new` and from static calls must not share HTML.
     *
     * @test
     */
    public function get_new_object()
    {
        $loaded = (new QueryList())->html($this->html);
        $blank = (new QueryList())->html('');
        $this->assertNotEquals($loaded->getHtml(), $blank->getHtml());

        $loaded = QueryList::range('')->html($this->html);
        $blank = QueryList::range('')->html('');
        $this->assertNotEquals($loaded->getHtml(), $blank->getHtml());
    }
}

View File

@@ -0,0 +1,36 @@
<?php
/**
* Created by PhpStorm.
* User: x
* Date: 2018/12/10
* Time: 1:14 AM
*/
namespace Tests\Feature;
use QL\QueryList;
use Tests\TestCaseBase;
/**
 * Tests for the utility methods bound onto QueryList.
 */
class MethodTest extends TestCaseBase
{
    protected $html;

    protected function setUp(): void
    {
        $this->html = $this->getSnippet('snippet-1');
    }

    /**
     * pipe() hands the instance to the callback and returns the callback's result.
     *
     * @test
     */
    public function pipe()
    {
        $expected = $this->html;

        $loader = function (QueryList $ql) use ($expected) {
            $ql->setHtml($expected);
            return $ql;
        };

        // getHtml(false) returns the raw HTML string that was set.
        $actual = QueryList::pipe($loader)->getHtml(false);

        $this->assertEquals($expected, $actual);
    }
}

20
tests/TestCaseBase.php Normal file
View File

@@ -0,0 +1,20 @@
<?php
/**
* Created by PhpStorm.
* User: x
* Date: 2018/12/9
* Time: 11:43 PM
*/
namespace Tests;
use PHPUnit\Framework\TestCase;
/**
 * Base class for the test suite; provides access to the HTML fixtures in tests/assets.
 */
class TestCaseBase extends TestCase
{
    /**
     * Load an HTML fixture from the assets directory next to this file.
     *
     * @param string $name Fixture file name without the ".html" extension
     * @return string Fixture contents
     * @throws \RuntimeException When the fixture is missing or cannot be read
     */
    public function getSnippet($name)
    {
        $path = __DIR__.'/assets/'.$name.'.html';

        // Fail with a clear message rather than letting `false` flow into the tests.
        $contents = is_file($path) ? file_get_contents($path) : false;
        if ($contents === false) {
            throw new \RuntimeException('Unable to read test snippet: '.$path);
        }

        return $contents;
    }
}

View File

@@ -0,0 +1,9 @@
<div id="one">
<div class="two">
<a href="http://querylist.cc">QueryList官网</a>
<img src="http://querylist.com/1.jpg" alt="这是图片" abc="这是一个自定义属性">
<img class="second_pic" src="http://querylist.com/2.jpg" alt="这是图片2">
<a href="http://doc.querylist.cc">QueryList文档</a>
</div>
<span>其它的<b>一些</b>文本</span>
</div>

View File

@@ -0,0 +1,16 @@
<div id="one">
<ul>
<li>
<a href="http://querylist.cc">QueryList官网</a>
<img src="http://querylist.com/1.jpg" alt="这是图片1" abc="这是一个自定义属性1">
</li>
<li>
<a href="http://v3.querylist.cc">QueryList V3文档</a>
<img src="http://querylist.com/2.jpg" alt="这是图片2" abc="这是一个自定义属性2">
</li>
<li>
<a href="http://v4.querylist.cc">QueryList V4文档</a>
<img src="http://querylist.com/3.jpg" alt="这是图片3" abc="这是一个自定义属性3">
</li>
</ul>
</div>

5
tests/bootstrap.php Normal file
View File

@@ -0,0 +1,5 @@
<?php
// Test-suite bootstrap.
// Remove PHP's execution time limit for the test run.
set_time_limit(0);
// Load the Composer autoloader from the project's vendor directory.
require __DIR__.'/../vendor/autoload.php';