120 Commits
V3.1 ... master

Author SHA1 Message Date
Jaeger(黄杰)
894fb4344e Merge pull request #145 from maxiaozhi/master
正则匹配成功时才替换掉html
2021-08-08 13:04:55 +08:00
lion
e4fc716acd 正则匹配成功时才替换掉html 2021-07-18 23:37:35 +08:00
Jaeger(黄杰)
39dc0ca9c6 Merge pull request #143 from maxiaozhi/patch-1
Fix the matching exception
2021-07-05 14:07:58 +08:00
maxiaozhi
ef0a2efd4f Fix the matching exception
Fix the matching exception when the page contains multiple tags prefixed with head (for example: < head >, < header >)
2021-07-05 13:51:24 +08:00
huangjie
5953daac54 update collect 2020-12-14 10:39:28 +08:00
huangjie
465c6aefc7 update collect 2020-09-27 17:41:44 +08:00
Jaeger(黄杰)
92cb319d44 Update README-ZH.md 2020-07-18 13:06:29 +08:00
Jaeger(黄杰)
cbf3e0fcad Update README.md 2020-07-18 13:05:59 +08:00
Jaeger(黄杰)
cfa2d94a79 Update FUNDING.yml 2020-07-17 13:20:49 +08:00
Jaeger(黄杰)
47a444bf9e Create FUNDING.yml 2020-07-17 13:08:44 +08:00
Jaeger
85903fa9b5 feat: rules add attrs 2020-04-03 20:16:00 +08:00
Jaeger(黄杰)
e527c637c7 Merge pull request #110 from jae-jae/develop
replace collect()
2020-04-03 04:55:27 -05:00
Jaeger
f0a9798925 replace collect() 2020-04-03 17:33:32 +08:00
Jaeger
faea883c6f fix: data callback 2020-04-01 22:03:50 +08:00
Jaeger
c16826a573 updaed composer dependency 2020-03-23 18:15:04 +08:00
Jaeger
1492751f98 feat: optimization getHtml() 2020-03-22 17:19:57 +08:00
Jaeger
b7954b9aef fix: memory overflow 2020-03-20 13:26:40 +08:00
Jaeger
b3d84cf057 feat: modify the each function of class elements 2020-03-15 14:17:18 +08:00
Jaeger
52bbdeae14 Merge branch 'master' of github.com:jae-jae/QueryList into develop 2020-03-15 14:07:52 +08:00
Jaeger(黄杰)
25b2dbdc86 Merge pull request #105 from edwinhuish/add-each-function-same-as-collection
添加 each function 并和 Collection 保持一致,返回 false 时中断循环。
2020-03-15 01:07:22 -05:00
Jaeger
02c2b125d8 feat: elements class add htmlOuters function 2020-03-15 13:58:00 +08:00
Jaeger
fc8b701ef2 feat: optimize range results 2020-03-15 13:45:00 +08:00
Jaeger
75e436c73f feat: merge master 2020-03-15 11:30:35 +08:00
Jaeger(黄杰)
aa90e5a21d Merge pull request #106 from edwinhuish/destroy-old-phpquey-object-when-setHtml
destroy old phpquery object when setHtml
2020-03-14 22:28:13 -05:00
Jaeger
dd9af6881d feat: rules add texts and htmls attribute 2020-03-13 21:42:25 +08:00
Jaeger
b07d4bfc74 feat: rules add texts and htmls attribute 2020-03-13 21:39:42 +08:00
Edwin Xu
8c1614c4c3 destroy old phpquery object when setHtml 2020-03-13 16:08:55 +08:00
Jaeger
b387ef5bb0 feat: rules add htmlOuter attribute 2020-03-13 15:16:44 +08:00
Edwin Xu
67f0052c5d 添加 each function 并和 Collection 保持一致,返回 false 时中断循环。 2020-03-13 14:20:37 +08:00
Jaeger
7c86f82527 fix: optimize memory usage 2020-03-13 13:49:36 +08:00
Jaeger(黄杰)
6ee6a26aee Merge pull request #102 from edwinhuish/auto-destroy-phpquery-document
destroy phpquery document object when destruct Query class
2020-03-11 10:29:31 -05:00
Jaeger(黄杰)
116f19da65 Merge pull request #104 from edwinhuish/add-phpdoc
fix phpdoc
2020-03-11 10:20:22 -05:00
Edwin Xu
67cbd0f473 修复phpdoc 2020-03-10 21:36:55 +08:00
Edwin Xu
3eb26451c6 修复phpdoc 2020-03-10 21:03:25 +08:00
Edwin Xu
a76ecb4258 destroy phpquery document object when destruct Query class 2020-03-05 22:27:27 +08:00
Jaeger
46f564bc8b Updated phpQuery 2019-02-22 15:33:54 +08:00
Jaeger
df9e3bbf19 test htpp cache 2018-12-12 15:29:31 +08:00
Jaeger
0c85eed7ef add multiGet and multiPost 2018-12-11 17:52:41 +08:00
Jaeger
df521923ac Concurrent requests 2018-12-11 00:00:17 +08:00
Jaeger
a779ef71f3 add MultiRequest 2018-12-10 19:23:15 +08:00
Jaeger
c32736bd9e add pipe 2018-12-10 01:27:48 +08:00
Jaeger
661bc3168d add phpunit 2018-12-10 00:13:16 +08:00
Jaeger
6d182ff061 remove instance 2018-12-07 00:35:58 +08:00
Jaeger
1c2e3f4adf add queryData() 2018-10-15 18:52:12 +08:00
Jaeger
1d73895981 single instance 2017-12-15 11:05:32 +08:00
Jaeger
03e6a955bf add https verify false 2017-12-14 10:31:47 +08:00
Jaeger
72a7543da3 fix laravel conflict bug 2017-11-15 10:46:51 +08:00
Jaeger
9d04003d73 fix laravel conflict bug 2017-11-15 10:43:28 +08:00
Jaeger
31ec950cdc ok 2017-10-09 11:27:08 +08:00
Jaeger
18bc6daea4 ok 2017-10-09 02:44:07 +08:00
Jaeger
f2c6ce7385 add comments 2017-10-09 01:48:56 +08:00
Jaeger
c0ed870dc8 ok 2017-10-08 23:01:22 +08:00
Jaeger
a4d0087e47 update README 2017-10-08 22:48:06 +08:00
Jaeger
a0f7b9aa3e ok 2017-10-02 10:30:24 +08:00
Jaeger
d812c47ede update 2017-10-01 23:37:09 +08:00
Jaeger
47c0f37233 update README 2017-10-01 12:49:01 +08:00
Jaeger
967ef10f23 ok 2017-10-01 01:14:32 +08:00
Jaeger
c82eb3c557 ok 2017-10-01 01:13:39 +08:00
Jaeger
f68cc2e218 add EN README 2017-10-01 01:11:47 +08:00
Jaeger
684e52c70e ok 2017-10-01 00:23:34 +08:00
Jaeger
777d837f18 update README 2017-09-30 21:49:07 +08:00
Jaeger
6e9a202ac2 update README 2017-09-30 21:46:31 +08:00
Jaeger
e885eece26 ok 2017-09-30 12:09:51 +08:00
Jaeger
aeeec5367e ok 2017-09-30 12:04:27 +08:00
Jaeger
c42a7b1766 ok 2017-09-30 12:02:25 +08:00
Jaeger
a3a830a744 add logo 2017-09-30 12:01:15 +08:00
Jaeger
7381ec21d3 update REMADE 2017-09-30 11:32:09 +08:00
Jaeger
95102a5ce2 ok 2017-09-30 01:41:09 +08:00
Jaeger
520195c929 update COMMUNITY 2017-09-30 01:39:16 +08:00
Jaeger
75799decc3 add COMMUNITY 2017-09-30 01:12:00 +08:00
Jaeger
33c574cdb9 ok 2017-09-29 23:47:35 +08:00
Jaeger
47a777789b ok 2017-09-29 18:43:24 +08:00
Jaeger
ad8ce44572 update README 2017-09-29 15:07:25 +08:00
Jaeger
1c54d63993 update README 2017-09-29 15:05:33 +08:00
Jaeger
59d48911fd update README 2017-09-29 14:59:16 +08:00
Jaeger
5ed0921d17 ok 2017-09-29 12:18:23 +08:00
Jaeger
fcdc5a16db ok 2017-09-29 12:16:50 +08:00
Jaeger
a8a438edbe update README 2017-09-29 00:36:02 +08:00
Jaeger
bd58352117 update http plugin 2017-09-26 17:54:29 +08:00
Jaeger
c3f8a48357 update config 2017-09-25 14:36:21 +08:00
Jaeger
006e24a117 fix bug 2017-09-25 14:15:26 +08:00
Jaeger
042993311f add getData 2017-09-24 15:11:44 +08:00
Jaeger
b6c21b653a V4 is coming 2017-09-22 22:37:25 +08:00
Jaeger
5422168c98 add plugin 2017-09-22 19:09:43 +08:00
Jaeger
624f071a0d fix bug 2017-09-22 12:05:29 +08:00
Jaeger
042c10cdea add Http service 2017-09-22 02:38:46 +08:00
Jaeger
2013e4d2b0 add Query 2017-09-22 01:51:46 +08:00
Jaeger
ad9b493fc0 add encoding service 2017-09-21 13:12:20 +08:00
Jaeger
43d8f71678 add service provider 2017-09-21 02:20:28 +08:00
Jaeger
02fe5a7f9e ok 2017-09-21 01:44:03 +08:00
Jaeger
8bd07f5fbb ok 2017-09-20 01:12:54 +08:00
Jaeger
02c4c93ee5 add query() 2017-09-19 19:06:16 +08:00
Jaeger
0fafaafa7b update README 2017-09-19 18:00:33 +08:00
Jaeger
fe749f08c2 add Dom 2017-09-19 17:48:48 +08:00
Jaeger
e3576ce407 start V4 2017-09-19 02:33:38 +08:00
Jaeger
1a7864dcf8 V3.2.1 2017-06-09 12:25:07 +08:00
Jaeger
5cc049992b V3.1.3 2017-06-09 12:23:43 +08:00
Jaeger
967f2d95cd fix bug 2017-06-09 12:21:49 +08:00
Jaeger
7f6b6e279e update composer 2017-06-09 12:13:17 +08:00
Jaeger
198385e336 Merge pull request #4 from baijunyao/master
Using version 0.9.7 of phpQuery-single
2017-06-09 12:10:55 +08:00
白俊遥
26d6cf5e43 Using version 0.9.7 of phpQuery-single
使用0.9.7版本的phpQuery-single以解决phpQuery.php文件中因写错hltml造成的错误;
2017-06-08 15:07:56 +08:00
Jaeger
700d56db49 Merge branch 'feature/log' into develop 2017-04-20 13:53:32 +08:00
Jaeger
1691ddf3ee log ok 2017-04-20 13:53:03 +08:00
Jaeger
cbae16c6a4 添加日志功能 2017-04-19 18:30:22 +08:00
Jaeger
66c4ef8c4f Merge pull request #2 from bryant1410/master
Fix broken headings in Markdown files
2017-04-17 14:38:35 +08:00
Jaeger
330c71778f Merge pull request #1 from han8gui/master
添加ua
2017-04-17 14:37:14 +08:00
Santiago Castro
1185ad399f Fix broken Markdown headings 2017-04-16 21:15:10 -03:00
han8gui
b3290d2484 添加ua 2017-02-09 10:51:09 +08:00
Jaeger
8e4cf456f2 update readme 2016-12-22 16:42:44 +08:00
Jaeger
f006e751ef update readme 2016-12-22 16:39:48 +08:00
Jaeger
64884ee72f update readme 2016-12-22 16:34:17 +08:00
Jaeger
777738adc3 update readme 2016-12-22 16:30:30 +08:00
JAE
4a003e5490 update readme 2016-01-14 11:29:09 +08:00
Jaeger
fbea1aaa94 update 2016-01-06 15:56:03 +08:00
x
c63ea6421c merge 2015-12-29 21:52:11 +08:00
x
ba6e6fb4c8 Merge branch 'dev' 2015-12-29 21:49:35 +08:00
x
8bbb3f3171 update var name 2015-12-29 21:45:55 +08:00
JAE
2b0c62489f update composer 2015-12-29 09:41:14 +08:00
JAE
6935f4b178 Merge branch 'master' of git.oschina.net:jae/QueryList 2015-12-28 15:45:44 +08:00
x
5ac4bfe0d0 update readme 2015-12-20 23:56:40 +08:00
34 changed files with 2286 additions and 442 deletions

12
.github/FUNDING.yml vendored Normal file
View File

@@ -0,0 +1,12 @@
# These are supported funding model platforms
github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
patreon: # Replace with a single Patreon username
open_collective: querylist # Replace with a single Open Collective username
ko_fi: # Replace with a single Ko-fi username
tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
liberapay: # Replace with a single Liberapay username
issuehunt: # Replace with a single IssueHunt username
otechie: # Replace with a single Otechie username
custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']

4
.gitignore vendored
View File

@@ -1,3 +1,5 @@
/vendor/
.idea/
composer.lock
composer.lock
.DS_Store
*.cache

View File

@@ -1,425 +0,0 @@
<?php
namespace QL;
use phpQuery,Exception,ReflectionClass;
/**
* QueryList
*
* 一个基于phpQuery的通用列表采集类
*
* @author Jaeger
* @email 734708094@qq.com
* @link http://git.oschina.net/jae/QueryList
* @version 3.1.0
*
* @example
*
//获取CSDN移动开发栏目下的文章列表标题
$hj = QueryList::Query('http://mobile.csdn.net/',array("title"=>array('.unit h1','text')));
print_r($hj->data);
//回调函数1
function callfun1($content,$key)
{
return '回调函数1'.$key.'-'.$content;
}
class HJ{
//回调函数2
static public function callfun2($content,$key)
{
return '回调函数2'.$key.'-'.$content;
}
}
//获取CSDN文章页下面的文章标题和内容
$url = 'http://www.csdn.net/article/2014-06-05/2820091-build-or-buy-a-mobile-game-backend';
$reg = array(
'title'=>array('h1','text','','callfun1'), //获取纯文本格式的标题,并调用回调函数1
'summary'=>array('.summary','text','-input strong'), //获取纯文本的文章摘要但保strong标签并去除input标签
'content'=>array('.news_content','html','div a -.copyright'), //获取html格式的文章内容但过滤掉div和a标签,去除类名为copyright的元素
'callback'=>array('HJ','callfun2') //调用回调函数2作为全局回调函数
);
$rang = '.left';
$hj = QueryList::Query($url,$reg,$rang);
print_r($hj->data);
//继续获取右边相关热门文章列表的标题以及链接地址
$hj->setQuery(array('title'=>array('','text'),'url'=>array('a','href')),'#con_two_2 li');
//输出数据
echo $hj->getData();
*/
/**
 * Generic list scraper built on phpQuery.
 *
 * A query takes a page (URL or HTML source), a set of named selector rules
 * and an optional range selector, and fills {@see QueryList::$data} with one
 * row per matched element.
 */
class QueryList
{
    /** @var array Selector rules of the current query: name => array(selector, type[, tag filter][, callback]). */
    private $regArr;

    /** @var array Rows collected by the most recent query. */
    public $data;

    /** @var string Range (block) selector; when empty, rules are applied to the whole document. */
    private $regRange;

    /** @var string Page source as fetched or as passed in by the caller. */
    public $html;

    /** @var string Page source as serialised by phpQuery after parsing (see getHtml()). */
    private $pqHtml;

    /** @var string|false Encoding to convert results to; false leaves results untouched. */
    private $outputEncoding = false;

    /** @var string|false Caller-declared page encoding; false means auto-detect. */
    private $inputEncoding = false;

    /** @var string|false Encoding used as the conversion source (declared or detected). */
    private $htmlEncoding;

    /** @var array Instances created by getInstance(), keyed by a hash of the constructor arguments. */
    public static $instances;

    public function __construct()
    {
    }

    /**
     * Static entry point: run a query and return the shared QueryList instance.
     *
     * @param string     $page           URL of the page to scrape (https supported) or raw HTML source
     * @param array      $regArr         Selector rules:
     *                                   array("name" => array("selector", "type"[, "tag filter"][, "callback"]), ...[, "callback" => global callback]).
     *                                   - selector: any jQuery selector.
     *                                   - type: "text", "html", or the name of an HTML attribute.
     *                                   - tag filter (optional): space-separated list. An entry prefixed with "-" is an
     *                                     element selector whose matches are removed together with their content.
     *                                     Other entries are tags to KEEP when type is "text" and tags to STRIP
     *                                     when type is "html".
     *                                   - callback / global callback (optional): function name or array("Class", "staticMethod").
     *                                     It receives the selected content and the rule key; a per-rule callback
     *                                     takes precedence over the global one.
     * @param string     $regRange       Range selector: blocks are selected first, then rules are applied inside each block
     * @param string     $outputEncoding Encoding for the results (UTF-8, GB2312, ...); a falsy value keeps the current setting
     * @param string     $inputEncoding  Encoding of the page (UTF-8, GB2312, ...); a falsy value keeps the current setting
     * @param bool|false $removeHead     Whether to blank the page's head section before parsing
     * @return mixed
     */
    public static function Query($page, array $regArr, $regRange = '', $outputEncoding = null, $inputEncoding = null, $removeHead = false)
    {
        return self::getInstance()->_query($page, $regArr, $regRange, $outputEncoding, $inputEncoding, $removeHead);
    }

    /**
     * Run a QueryList extension.
     *
     * @param string $class Extension class name, resolved inside the QL\Ext namespace
     * @param array  $args  Arguments handed to the extension's run() method
     * @return mixed
     * @throws Exception when the extension class does not exist
     */
    public static function run($class, $args = array())
    {
        $extension = self::getInstance("QL\\Ext\\{$class}");
        return $extension->run($args);
    }

    /**
     * Return a cached instance of an arbitrary class.
     *
     * The first argument is the class name and the remaining ones are constructor
     * arguments; with no arguments an instance of this class is returned. Instances
     * are cached per distinct argument list.
     *
     * @return mixed
     * @throws Exception when the class does not exist
     */
    public static function getInstance()
    {
        $args = func_get_args();
        if (count($args) === 0) {
            $args = array(self::class);
        }
        $key = md5(serialize($args));
        $className = array_shift($args);
        if (!class_exists($className)) {
            throw new Exception("no class {$className}");
        }
        if (!isset(self::$instances[$key])) {
            $rc = new ReflectionClass($className);
            self::$instances[$key] = $rc->newInstanceArgs($args);
        }
        return self::$instances[$key];
    }

    /**
     * Return the page source (mainly useful for debugging).
     *
     * @param bool|true $rel true for the source as serialised by phpQuery, false for the raw source
     * @return string
     */
    public function getHtml($rel = true)
    {
        // The original read an undeclared "qpHtml" property; the declared name is pqHtml.
        return $rel ? $this->pqHtml : $this->html;
    }

    /**
     * Return the collected rows.
     *
     * @param callback $callback Optional callable applied to every row via array_map
     * @return array
     */
    public function getData($callback = null)
    {
        if (is_callable($callback)) {
            return array_map($callback, $this->data);
        }
        return $this->data;
    }

    /**
     * Run a new set of rules against the page that is already loaded.
     *
     * @param array      $regArr
     * @param string     $regRange
     * @param string     $outputEncoding
     * @param string     $inputEncoding
     * @param bool|false $removeHead
     * @return QueryList
     */
    public function setQuery(array $regArr, $regRange = '', $outputEncoding = null, $inputEncoding = null, $removeHead = false)
    {
        return $this->_query($this->html, $regArr, $regRange, $outputEncoding, $inputEncoding, $removeHead);
    }

    /**
     * Load the page, store the query settings and collect the data.
     *
     * @return QueryList $this
     */
    private function _query($page, array $regArr, $regRange, $outputEncoding, $inputEncoding, $removeHead)
    {
        $this->data = array();
        $this->html = $this->_isURL($page) ? $this->_request($page) : $page;

        // Falsy encodings leave the previously configured value in place.
        if ($outputEncoding) {
            $this->outputEncoding = $outputEncoding;
        }
        if ($inputEncoding) {
            $this->inputEncoding = $inputEncoding;
        }
        if ($removeHead) {
            $this->html = $this->_removeHead($this->html);
        }

        $this->pqHtml = '';
        if (empty($this->html)) {
            trigger_error("The received content is empty!", E_USER_NOTICE);
        }

        // Conversion source encoding: the declared one, otherwise detected from the content.
        $this->htmlEncoding = $this->inputEncoding ? $this->inputEncoding : $this->_getEncode($this->html);
        $this->regArr = $regArr;
        $this->regRange = $regRange;
        $this->_getList();
        return $this;
    }

    /**
     * Apply the stored rules to the stored page and fill $this->data.
     *
     * With a range selector each matched block becomes one row. Without one,
     * the n-th match of every rule is written to row n.
     */
    private function _getList()
    {
        if ($this->inputEncoding) {
            phpQuery::$defaultCharset = $this->inputEncoding;
        }
        $document = phpQuery::newDocumentHTML($this->html);
        $this->pqHtml = $document->htmlOuter();

        if (!empty($this->regRange)) {
            $row = 0;
            foreach (pq($document)->find($this->regRange) as $block) {
                // foreach replaces each(), which was removed in PHP 8.
                foreach ($this->regArr as $key => $rule) {
                    // Strict comparison so that a rule keyed 0 is not mistaken for the global callback.
                    if ($key === 'callback') {
                        continue;
                    }
                    $this->data[$row][$key] = $this->_extractValue(pq($block)->find($rule[0]), $rule, $key);
                }
                $row++;
            }
        } else {
            foreach ($this->regArr as $key => $rule) {
                if ($key === 'callback') {
                    continue;
                }
                // NOTE(review): the document is re-created for every rule, as in the original.
                // Presumably this keeps pq() pointed at a live document after _removeTags()
                // has loaded and unloaded its own; confirm before hoisting it out of the loop.
                $document = phpQuery::newDocumentHTML($this->html);
                $row = 0;
                foreach (pq($document)->find($rule[0]) as $node) {
                    $this->data[$row][$key] = $this->_extractValue($node, $rule, $key);
                    $row++;
                }
            }
        }

        if ($this->outputEncoding) {
            // Convert the collected values to the requested output encoding.
            $this->data = $this->_arrayConvertEncoding($this->data, $this->outputEncoding, $this->htmlEncoding);
        }
        phpQuery::$documents = array();
    }

    /**
     * Read one rule's value from a matched target and run it through the callbacks.
     *
     * @param mixed  $target Node or phpQuery selection handed to pq()
     * @param array  $rule   array(selector, type[, tag filter][, callback])
     * @param string $key    Rule name, passed to the callback as second argument
     * @return mixed
     */
    private function _extractValue($target, $rule, $key)
    {
        $tags = isset($rule[2]) ? $rule[2] : '';
        switch ($rule[1]) {
            case 'text':
                $value = $this->_allowTags(pq($target)->html(), $tags);
                break;
            case 'html':
                $value = $this->_stripTags(pq($target)->html(), $tags);
                break;
            default:
                $value = pq($target)->attr($rule[1]);
                break;
        }

        // A per-rule callback takes precedence over the global one.
        if (isset($rule[3])) {
            $value = call_user_func($rule[3], $value, $key);
        } elseif (isset($this->regArr['callback'])) {
            $value = call_user_func($this->regArr['callback'], $value, $key);
        }
        return $value;
    }

    /**
     * Fetch a URL, with cURL when available and a stream context otherwise.
     *
     * @param string $url
     * @return string|false Response body; false when the request fails
     */
    private function _request($url)
    {
        if (function_exists('curl_init')) {
            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, $url);
            // NOTE(review): TLS peer/host verification is switched off, so https responses
            // are not authenticated. Left as-is because callers may rely on it.
            curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
            curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
            curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
            curl_setopt($ch, CURLOPT_AUTOREFERER, true);
            curl_setopt($ch, CURLOPT_REFERER, $url);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
            $result = curl_exec($ch);
            curl_close($ch);
        } else {
            // The original also had a branch for PHP < 5, which can never run here.
            $opts = array(
                'http' => array(
                    'header' => "Referer:{$url}"
                )
            );
            $result = file_get_contents($url, false, stream_context_create($opts));
        }
        return $result;
    }

    /**
     * Blank the page's head section.
     *
     * @param string $html
     * @return string The page with its head emptied, or the page unchanged when nothing matched
     */
    private function _removeHead($html)
    {
        // Match only a real <head> tag (not <header>) and stop at the first </head>.
        $stripped = preg_replace('/<head(?:\s[^>]*)?>.*?<\/head>/is', '<head></head>', $html);
        // preg_replace() yields null on a PCRE error; keep the page rather than lose it.
        return $stripped === null ? $html : $stripped;
    }

    /**
     * Detect the encoding of a string.
     *
     * @param string $string
     * @return string|false One of ASCII, GB2312, GBK, UTF-8, or false when none matches
     */
    private function _getEncode($string)
    {
        return mb_detect_encoding($string, array('ASCII', 'GB2312', 'GBK', 'UTF-8'));
    }

    /**
     * Convert the encoding of every string value in a (nested) array.
     *
     * Keys and non-string values are left untouched. A value that iconv cannot
     * convert is kept as it was.
     *
     * @param array  $arr
     * @param string $toEncoding
     * @param string $fromEncoding
     * @return array
     */
    private function _arrayConvertEncoding($arr, $toEncoding, $fromEncoding)
    {
        // Converted value by value: the original eval()'d a var_export() of the scraped
        // data after a byte-level iconv, which could break or execute page content.
        foreach ($arr as $k => $value) {
            if (is_array($value)) {
                $arr[$k] = $this->_arrayConvertEncoding($value, $toEncoding, $fromEncoding);
            } elseif (is_string($value)) {
                $converted = iconv($fromEncoding, $toEncoding.'//IGNORE', $value);
                if ($converted !== false) {
                    $arr[$k] = $converted;
                }
            }
        }
        return $arr;
    }

    /**
     * Rough check whether the argument looks like an http(s) URL.
     *
     * @param string $str
     * @return boolean
     */
    private function _isURL($str)
    {
        return preg_match('/^http(s)?:\\/\\/.+/', $str) === 1;
    }

    /**
     * Strip the given HTML tags (the tags only, their content stays).
     *
     * @param string $html
     * @param string $tags_str Space-separated tag names; "-selector" entries are removed with their content
     * @return string
     */
    private function _stripTags($html, $tags_str)
    {
        $tagsArr = $this->_tag($tags_str);
        $html = $this->_removeTags($html, $tagsArr[1]);
        $patterns = array();
        foreach ($tagsArr[0] as $tag) {
            // The lookahead stops "a" from also matching <abbr>, <article>, ...
            $patterns[] = '/<\/?'.preg_quote($tag, '/').'(?![\w:-])[^>]*>/i';
        }
        return preg_replace($patterns, "", trim($html));
    }

    /**
     * Strip all HTML tags except the given ones.
     *
     * @param string $html
     * @param string $tags_str Space-separated tag names to keep; "-selector" entries are removed with their content
     * @return string
     */
    private function _allowTags($html, $tags_str)
    {
        $tagsArr = $this->_tag($tags_str);
        $html = $this->_removeTags($html, $tagsArr[1]);
        $allow = '';
        foreach ($tagsArr[0] as $tag) {
            $allow .= "<$tag> ";
        }
        return strip_tags(trim($html), $allow);
    }

    /**
     * Split a tag filter string into plain tags and removal selectors.
     *
     * @param string $tags_str Space-separated entries
     * @return array array(0 => plain tag names, 1 => selectors that were prefixed with "-")
     */
    private function _tag($tags_str)
    {
        $tags = array(array(), array());
        foreach (preg_split("/\s+/", $tags_str, -1, PREG_SPLIT_NO_EMPTY) as $tag) {
            // Anchored: only a LEADING minus marks a removal selector, so "x-foo" stays a tag name.
            if (preg_match('/^-(.+)/', $tag, $arr)) {
                $tags[1][] = $arr[1];
            } else {
                $tags[0][] = $tag;
            }
        }
        return $tags;
    }

    /**
     * Remove the elements matched by the given selectors, including their content.
     *
     * @param string $html
     * @param array  $tags Element selectors
     * @return string
     */
    private function _removeTags($html, $tags)
    {
        if (count($tags)) {
            phpQuery::$defaultCharset = $this->inputEncoding ? $this->inputEncoding : $this->htmlEncoding;
            $doc = phpQuery::newDocumentHTML($html);
            pq($doc)->find(implode(',', $tags))->remove();
            $html = pq($doc)->htmlOuter();
            $doc->unloadDocument();
        }
        return $html;
    }
}
/*
class Autoload
{
public static function load($className)
{
$files = array(
sprintf('%s/extensions/%s.php',__DIR__,$className),
sprintf('%s/extensions/vendors/%s.php',__DIR__,$className)
);
foreach ($files as $file) {
if(is_file($file)){
require $file;
return true;
}
}
return false;
}
}
spl_autoload_register(array('Autoload','load'));
*/

309
README-ZH.md Normal file
View File

@@ -0,0 +1,309 @@
<p align="center">
<img width="150" src="logo.png" alt="QueryList">
<br>
<br>
</p>
# QueryList 简介
`QueryList`是一套简洁、优雅、可扩展的PHP采集工具(爬虫)基于phpQuery。
## 特性
- 拥有与jQuery完全相同的CSS3 DOM选择器
- 拥有与jQuery完全相同的DOM操作API
- 拥有通用的列表采集方案
- 拥有强大的HTTP请求套件,轻松实现如:模拟登陆、伪造浏览器、HTTP代理等任意复杂的网络请求
- 拥有乱码解决方案
- 拥有强大的内容过滤功能,可使用jQuery选择器来过滤内容
- 拥有高度的模块化设计,扩展性强
- 拥有富有表现力的API
- 拥有高质量文档
- 拥有丰富的插件
- 拥有专业的问答社区和交流群
通过插件可以轻松实现诸如:
- 多线程采集
- 采集JavaScript动态渲染的页面 (PhantomJS/headless WebKit)
- 图片本地化
- 模拟浏览器行为提交Form表单
- 网络爬虫
- .....
## 环境要求
- PHP >= 7.1
> 如果你的PHP版本还停留在PHP5或者不会使用Composer,你可以选择使用QueryList3,QueryList3支持php5.3以及手动安装。
QueryList3 文档:http://v3.querylist.cc
## 安装
通过Composer安装:
```
composer require jaeger/querylist
```
## 使用
#### 元素操作
- 采集「昵图网」所有图片地址
```php
QueryList::get('http://www.nipic.com')->find('img')->attrs('src');
```
- 采集百度搜索结果
```php
$ql = QueryList::get('http://www.baidu.com/s?wd=QueryList');
$ql->find('title')->text(); // 获取网站标题
$ql->find('meta[name=keywords]')->content; // 获取网站头部关键词
$ql->find('h3>a')->texts(); //获取搜索结果标题列表
$ql->find('h3>a')->attrs('href'); //获取搜索结果链接列表
$ql->find('img')->src; //获取第一张图片的链接地址
$ql->find('img:eq(1)')->src; //获取第二张图片的链接地址
$ql->find('img')->eq(2)->src; //获取第三张图片的链接地址
// 遍历所有图片
$ql->find('img')->map(function($img){
echo $img->alt; //打印图片的alt属性
});
```
- 更多用法
```php
$ql->find('#head')->append('<div>追加内容</div>')->find('div')->htmls();
$ql->find('.two')->children('img')->attrs('alt'); //获取class为two元素下的所有img孩子节点
//遍历class为two元素下的所有孩子节点
$data = $ql->find('.two')->children()->map(function ($item){
//用is判断节点类型
if($item->is('a')){
return $item->text();
}elseif($item->is('img'))
{
return $item->alt;
}
});
$ql->find('a')->attr('href', 'newVal')->removeClass('className')->html('newHtml')->...
$ql->find('div > p')->add('div > ul')->filter(':has(a)')->find('p:first')->nextAll()->andSelf()->...
$ql->find('div.old')->replaceWith( $ql->find('div.new')->clone())->appendTo('.trash')->prepend('Deleted')->...
```
#### 列表采集
采集百度搜索结果列表的标题和链接:
```php
$data = QueryList::get('http://www.baidu.com/s?wd=QueryList')
// 设置采集规则
->rules([
'title'=>array('h3','text'),
'link'=>array('h3>a','href')
])
->query()->getData();
print_r($data->all());
```
采集结果:
```
Array
(
[0] => Array
(
[title] => QueryList|基于phpQuery的无比强大的PHP采集工具
[link] => http://www.baidu.com/link?url=GU_YbDT2IHk4ns1tjG2I8_vjmH0SCJEAPuuZN
)
[1] => Array
(
[title] => PHP 用QueryList抓取网页内容 - wb145230 - 博客园
[link] => http://www.baidu.com/link?url=zn0DXBnrvIF2ibRVW34KcRVFG1_bCdZvqvwIhUqiXaS
)
[2] => Array
(
[title] => 介绍- QueryList指导文档
[link] => http://www.baidu.com/link?url=pSypvMovqS4v2sWeQo5fDBJ4EoYhXYi0Lxx
)
//...
)
```
#### 编码转换
```php
// 输出编码:UTF-8,输入编码:GB2312
QueryList::get('https://top.etao.com')->encoding('UTF-8','GB2312')->find('a')->texts();
// 输出编码:UTF-8,输入编码:自动识别
QueryList::get('https://top.etao.com')->encoding('UTF-8')->find('a')->texts();
```
#### HTTP网络操作GuzzleHttp
- 携带cookie登录新浪微博
```php
//采集新浪微博需要登录才能访问的页面
$ql = QueryList::get('http://weibo.com','param1=testvalue & params2=somevalue',[
'headers' => [
//填写从浏览器获取到的cookie
'Cookie' => 'SINAGLOBAL=546064; wb_cmtLike_2112031=1; wvr=6;....'
]
]);
//echo $ql->getHtml();
echo $ql->find('title')->text();
//输出: 我的首页 微博-随时随地发现新鲜事
```
- 使用Http代理
```php
$urlParams = ['param1' => 'testvalue','params2' => 'somevalue'];
$opts = [
// 设置http代理
'proxy' => 'http://222.141.11.17:8118',
//设置超时时间,单位:秒
'timeout' => 30,
// 伪造http头
'headers' => [
'Referer' => 'https://querylist.cc/',
'User-Agent' => 'testing/1.0',
'Accept' => 'application/json',
'X-Foo' => ['Bar', 'Baz'],
'Cookie' => 'abc=111;xxx=222'
]
];
$ql->get('http://httpbin.org/get',$urlParams,$opts);
// echo $ql->getHtml();
```
- 模拟登录
```php
// 用post登录
$ql = QueryList::post('http://xxxx.com/login',[
'username' => 'admin',
'password' => '123456'
])->get('http://xxx.com/admin');
//采集需要登录才能访问的页面
$ql->get('http://xxx.com/admin/page');
//echo $ql->getHtml();
```
#### Form表单操作
模拟登陆GitHub
```php
// 获取QueryList实例
$ql = QueryList::getInstance();
//获取到登录表单
$form = $ql->get('https://github.com/login')->find('form');
//填写GitHub用户名和密码
$form->find('input[name=login]')->val('your github username or email');
$form->find('input[name=password]')->val('your github password');
//序列化表单数据
$fromData = $form->serializeArray();
$postData = [];
foreach ($fromData as $item) {
$postData[$item['name']] = $item['value'];
}
//提交登录表单
$actionUrl = 'https://github.com'.$form->attr('action');
$ql->post($actionUrl,$postData);
//判断登录是否成功
// echo $ql->getHtml();
$userName = $ql->find('.header-nav-current-user>.css-truncate-target')->text();
if($userName)
{
echo '登录成功!欢迎你:'.$userName;
}else{
echo '登录失败!';
}
```
#### Bind功能扩展
自定义扩展一个`myHttp`方法:
```php
$ql = QueryList::getInstance();
//绑定一个myHttp方法到QueryList对象
$ql->bind('myHttp',function ($url){
// $this 为当前的QueryList对象
$html = file_get_contents($url);
$this->setHtml($html);
return $this;
});
//然后就可以通过注册的名字来调用
$data = $ql->myHttp('https://toutiao.io')->find('h3 a')->texts();
print_r($data->all());
```
或者把实现体封装到class然后这样绑定:
```php
$ql->bind('myHttp',function ($url){
return new MyHttp($this,$url);
});
```
#### 插件使用
- 使用PhantomJS插件采集JavaScript动态渲染的页面:
```php
// 安装时设置PhantomJS二进制文件路径
$ql = QueryList::use(PhantomJs::class,'/usr/local/bin/phantomjs');
// 采集今日头条手机版
$data = $ql->browser('https://m.toutiao.com')->find('p')->texts();
print_r($data->all());
// 使用HTTP代理
$ql->browser('https://m.toutiao.com',false,[
'--proxy' => '192.168.1.42:8080',
'--proxy-type' => 'http'
])
```
- 使用CURL多线程插件,多线程采集GitHub排行榜:
```php
$ql = QueryList::use(CurlMulti::class);
$ql->curlMulti([
'https://github.com/trending/php',
'https://github.com/trending/go',
//.....more urls
])
// 每个任务成功完成调用此回调
->success(function (QueryList $ql,CurlMulti $curl,$r){
echo "Current url:{$r['info']['url']} \r\n";
$data = $ql->find('h3 a')->texts();
print_r($data->all());
})
// 每个任务失败回调
->error(function ($errorInfo,CurlMulti $curl){
echo "Current url:{$errorInfo['info']['url']} \r\n";
print_r($errorInfo['error']);
})
->start([
// 最大并发数
'maxThread' => 10,
// 错误重试次数
'maxTry' => 3,
]);
```
## 插件
- [jae-jae/QueryList-PhantomJS](https://github.com/jae-jae/QueryList-PhantomJS): 使用PhantomJS采集JavaScript动态渲染的页面
- [jae-jae/QueryList-CurlMulti](https://github.com/jae-jae/QueryList-CurlMulti) : Curl多线程采集
- [jae-jae/QueryList-AbsoluteUrl](https://github.com/jae-jae/QueryList-AbsoluteUrl) : 转换URL相对路径到绝对路径
- [jae-jae/QueryList-Rule-Google](https://github.com/jae-jae/QueryList-Rule-Google) : 谷歌搜索引擎
- [jae-jae/QueryList-Rule-Baidu](https://github.com/jae-jae/QueryList-Rule-Baidu) : 百度搜索引擎
查看更多的QueryList插件和基于QueryList的产品:[QueryList社区力量](https://github.com/jae-jae/QueryList-Community)
## 贡献
欢迎为QueryList贡献代码。关于贡献插件可以查看:[QueryList插件贡献说明](https://github.com/jae-jae/QueryList-Community/blob/master/CONTRIBUTING.md)
## 寻求帮助?
- QueryList主页: [http://querylist.cc](http://querylist.cc/)
- QueryList文档: [http://doc.querylist.cc](http://doc.querylist.cc/)
- QueryList问答:[http://wenda.querylist.cc](http://wenda.querylist.cc/)
- QueryList交流QQ群:123266961 <a target="_blank" href="http://shang.qq.com/wpa/qunwpa?idkey=a1b248ae30b3f711bdab4f799df839300dc7fed54331177035efa0513da027f6"><img border="0" src="http://pub.idqqimg.com/wpa/images/group.png" alt="cafeEX" title="cafeEX"></a>
- GitHub:https://github.com/jae-jae/QueryList
- Git@OSC:http://git.oschina.net/jae/QueryList
## Author
Jaeger <JaegerCode@gmail.com>
## License
QueryList is licensed under the license of MIT. See the LICENSE for more details.

315
README.md
View File

@@ -1,11 +1,304 @@
#QueryList交流社区: [http://querylist.cc/](http://querylist.cc/)
#QueryList交流QQ群:123266961 <a target="_blank" href="http://shang.qq.com/wpa/qunwpa?idkey=a1b248ae30b3f711bdab4f799df839300dc7fed54331177035efa0513da027f6"><img border="0" src="http://pub.idqqimg.com/wpa/images/group.png" alt="╰☆邪恶 魔方☆" title="╰☆邪恶 魔方☆"></a>
#QueryList简介
***
QueryList是一个基于phpQuery的通用列表采集类,是一个简单、 灵活、强大的采集工具,采集任何复杂的页面 基本上就一句话就能搞定了。
#QueryList 使用
```php
//获取采集对象
$hj = QueryList::Query('http://www.baidu.com/s?wd=QueryList',array('title'=>array('h3','text'),'link'=>array('h3>a','href')));
<p align="center">
<img width="150" src="logo.png" alt="QueryList">
<br>
<br>
</p>
# QueryList
`QueryList` is a simple, elegant, extensible PHP web scraper (crawler/spider), based on phpQuery.
[API Documentation](https://github.com/jae-jae/QueryList/wiki)
[中文文档](README-ZH.md)
## Features
- Has the same CSS3 DOM selectors as jQuery
- Has the same DOM manipulation API as jQuery
- Has a generic list-crawling solution
- Has a powerful HTTP request suite that makes complex network requests easy, such as simulated login, browser spoofing and HTTP proxies
- Has a solution for garbled text (character-encoding problems)
- Has powerful content filtering: you can use jQuery selectors to filter content
- Has a highly modular design with strong extensibility
- Has an expressive API
- Has a wealth of plug-ins
Through plug-ins you can easily implement things like:
- Multithreaded crawling
- Crawling pages rendered dynamically by JavaScript (PhantomJS/headless WebKit)
- Downloading images to local storage
- Simulating browser behavior, such as submitting forms
- Web crawling
- .....
## Requirements
- PHP >= 7.1
## Installation
Install via Composer:
```
composer require jaeger/querylist
```
## Usage
#### DOM Traversal and Manipulation
- Crawl all image links on GitHub
```php
QueryList::get('https://github.com')->find('img')->attrs('src');
```
- Crawl Google search results
```php
$ql = QueryList::get('https://www.google.co.jp/search?q=QueryList');
$ql->find('title')->text(); //The page title
$ql->find('meta[name=keywords]')->content; //The page keywords
$ql->find('h3>a')->texts(); //Get a list of search results titles
$ql->find('h3>a')->attrs('href'); //Get a list of search results links
$ql->find('img')->src; //Gets the link address of the first image
$ql->find('img:eq(1)')->src; //Gets the link address of the second image
$ql->find('img')->eq(2)->src; //Gets the link address of the third image
// Loop all the images
$ql->find('img')->map(function($img){
echo $img->alt; //Print the alt attribute of the image
});
```
- More usage
```php
$ql->find('#head')->append('<div>Append content</div>')->find('div')->htmls();
$ql->find('.two')->children('img')->attrs('alt'); // Get the class is the "two" element under all img child nodes
// Loop class is the "two" element under all child nodes
$data = $ql->find('.two')->children()->map(function ($item){
// Use "is" to determine the node type
if($item->is('a')){
return $item->text();
}elseif($item->is('img'))
{
return $item->alt;
}
});
$ql->find('a')->attr('href', 'newVal')->removeClass('className')->html('newHtml')->...
$ql->find('div > p')->add('div > ul')->filter(':has(a)')->find('p:first')->nextAll()->andSelf()->...
$ql->find('div.old')->replaceWith( $ql->find('div.new')->clone())->appendTo('.trash')->prepend('Deleted')->...
```
#### List crawl
Crawl the title and link of the Google search results list:
```php
$data = QueryList::get('https://www.google.co.jp/search?q=QueryList')
// Set the crawl rules
->rules([
'title'=>array('h3','text'),
'link'=>array('h3>a','href')
])
->query()->getData();
print_r($data->all());
```
Results:
```
Array
(
[0] => Array
(
[title] => Angular - QueryList
[link] => https://angular.io/api/core/QueryList
)
[1] => Array
(
[title] => QueryList | @angular/core - Angularリファレンス - Web Creative Park
[link] => http://www.webcreativepark.net/angular/querylist/
)
[2] => Array
(
[title] => QueryListにQueryを追加したり、追加されたことを感知する | TIPS ...
[link] => http://www.webcreativepark.net/angular/querylist_query_add_subscribe/
)
//...
)
```
#### Encode convert
```php
// Out charset :UTF-8
// In charset :GB2312
QueryList::get('https://top.etao.com')->encoding('UTF-8','GB2312')->find('a')->texts();
// Out charset:UTF-8
// In charset:Automatic Identification
QueryList::get('https://top.etao.com')->encoding('UTF-8')->find('a')->texts();
```
#### HTTP Client (GuzzleHttp)
- Log in to GitHub with a cookie
```php
//Crawl GitHub content
$ql = QueryList::get('https://github.com','param1=testvalue & params2=somevalue',[
'headers' => [
// Fill in the cookie from the browser
'Cookie' => 'SINAGLOBAL=546064; wb_cmtLike_2112031=1; wvr=6;....'
]
]);
//echo $ql->getHtml();
$userName = $ql->find('.header-nav-current-user>.css-truncate-target')->text();
echo $userName;
```
- Use the Http proxy
```php
$urlParams = ['param1' => 'testvalue','params2' => 'somevalue'];
$opts = [
// Set the http proxy
'proxy' => 'http://222.141.11.17:8118',
//Set the timeout time in seconds
'timeout' => 30,
// Fake HTTP headers
'headers' => [
'Referer' => 'https://querylist.cc/',
'User-Agent' => 'testing/1.0',
'Accept' => 'application/json',
'X-Foo' => ['Bar', 'Baz'],
'Cookie' => 'abc=111;xxx=222'
]
];
$ql->get('http://httpbin.org/get',$urlParams,$opts);
// echo $ql->getHtml();
```
- Simulated login
```php
// Post login
$ql = QueryList::post('http://xxxx.com/login',[
'username' => 'admin',
'password' => '123456'
])->get('http://xxx.com/admin');
// Crawl pages that need to be logged in to access
$ql->get('http://xxx.com/admin/page');
//echo $ql->getHtml();
```
#### Submit forms
Log in to GitHub:
```php
// Get the QueryList instance
$ql = QueryList::getInstance();
// Get the login form
$form = $ql->get('https://github.com/login')->find('form');
// Fill in the GitHub username and password
$form->find('input[name=login]')->val('your github username or email');
$form->find('input[name=password]')->val('your github password');
// Serialize the form data
$fromData = $form->serializeArray();
$postData = [];
foreach ($fromData as $item) {
$postData[$item['name']] = $item['value'];
}
// Submit the login form
$actionUrl = 'https://github.com'.$form->attr('action');
$ql->post($actionUrl,$postData);
// To determine whether the login is successful
// echo $ql->getHtml();
$userName = $ql->find('.header-nav-current-user>.css-truncate-target')->text();
if($userName)
{
echo 'Login successful ! Welcome:'.$userName;
}else{
echo 'Login failed !';
}
```
#### Bind function extension
Customize the extension of a `myHttp` method:
```php
$ql = QueryList::getInstance();
//Bind a `myHttp` method to the QueryList object
$ql->bind('myHttp',function ($url){
// $this is the current QueryList object
$html = file_get_contents($url);
$this->setHtml($html);
return $this;
});
// And then you can call by the name of the binding
$data = $ql->myHttp('https://toutiao.io')->find('h3 a')->texts();
print_r($data->all());
```
Or wrap it in a class, and then bind:
```php
$ql->bind('myHttp',function ($url){
return new MyHttp($this,$url);
});
```
#### Using plugins
- Use the PhantomJS plugin to crawl JavaScript dynamically rendered pages:
```php
// Set the PhantomJS binary file path during installation
$ql = QueryList::use(PhantomJs::class,'/usr/local/bin/phantomjs');
// Crawl「500px」all picture links
$data = $ql->browser('https://500px.com/editors')->find('img')->attrs('src');
print_r($data->all());
// Use the HTTP proxy
$ql->browser('https://500px.com/editors',false,[
'--proxy' => '192.168.1.42:8080',
'--proxy-type' => 'http'
])
```
- Use the CurlMulti plugin for multi-threaded crawling of GitHub trending:
```php
$ql = QueryList::use(CurlMulti::class);
$ql->curlMulti([
'https://github.com/trending/php',
'https://github.com/trending/go',
//.....more urls
])
// Called if task is success
->success(function (QueryList $ql,CurlMulti $curl,$r){
echo "Current url:{$r['info']['url']} \r\n";
$data = $ql->find('h3 a')->texts();
print_r($data->all());
})
// Task fail callback
->error(function ($errorInfo,CurlMulti $curl){
echo "Current url:{$errorInfo['info']['url']} \r\n";
print_r($errorInfo['error']);
})
->start([
// Maximum number of threads
'maxThread' => 10,
// Number of error retries
'maxTry' => 3,
]);
```
## Plugins
- [jae-jae/QueryList-PhantomJS](https://github.com/jae-jae/QueryList-PhantomJS) : Use PhantomJS to crawl pages rendered dynamically by JavaScript.
- [jae-jae/QueryList-CurlMulti](https://github.com/jae-jae/QueryList-CurlMulti) : Curl multi threading.
- [jae-jae/QueryList-AbsoluteUrl](https://github.com/jae-jae/QueryList-AbsoluteUrl) : Converting relative urls to absolute.
- [jae-jae/QueryList-Rule-Google](https://github.com/jae-jae/QueryList-Rule-Google) : Google searcher.
- [jae-jae/QueryList-Rule-Baidu](https://github.com/jae-jae/QueryList-Rule-Baidu) : Baidu searcher.
View more QueryList plugins and QueryList-based products: [QueryList Community](https://github.com/jae-jae/QueryList-Community)
## Contributing
Contributions to QueryList are welcome. For details on contributing plugins, see the [QueryList Plugin Contributing Guide](https://github.com/jae-jae/QueryList-Community/blob/master/CONTRIBUTING.md).
## Author
Jaeger <JaegerCode@gmail.com>
If this library is useful to you, say thanks by [buying me a beer :beer:](https://www.paypal.me/jaepay)!
## License
QueryList is licensed under the license of MIT. See the LICENSE for more details.

View File

@@ -1,20 +1,40 @@
{
"name": "jaeger/querylist",
"description": "QueryList是基于phpQuery的无比强大的PHP采集工具",
"description": "Simple, elegant, extensible PHP Web Scraper (crawler/spider),Use the css3 dom selector,Based on phpQuery! 简洁、优雅、可扩展的PHP采集工具(爬虫)基于phpQuery。",
"keywords":["QueryList","phpQuery","spider"],
"homepage": "http://querylist.cc",
"require": {
"PHP":">=5.3.0",
"jaeger/phpquery-single": "^0.9.5"
"PHP":">=7.1",
"jaeger/phpquery-single": "^1",
"jaeger/g-http": "^1.1",
"ext-dom": "*",
"tightenco/collect": ">5.0"
},
"suggest":{
},
"license": "MIT",
"authors": [
{
"name": "Jaeger",
"email": "hj.q@qq.com"
"email": "JaegerCode@gmail.com"
}
],
"autoload":{
"psr-4":{
"QL\\":""
"QL\\":"src"
}
},
"autoload-dev": {
"psr-4": {
"Tests\\": "tests/"
}
},
"require-dev": {
"symfony/var-dumper": "^3.3",
"phpunit/phpunit": "^8.5"
},
"scripts": {
"test": "./vendor/bin/phpunit"
}
}

BIN
logo.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 15 KiB

19
phpunit.xml Normal file
View File

@@ -0,0 +1,19 @@
<phpunit
bootstrap="vendor/autoload.php"
convertErrorsToExceptions="true"
convertNoticesToExceptions="true"
convertWarningsToExceptions="true"
>
<testsuites>
<testsuite name="querylist">
<directory>./tests</directory>
</testsuite>
</testsuites>
<filter>
<whitelist>
<directory suffix=".php">src</directory>
</whitelist>
</filter>
</phpunit>

94
src/Config.php Normal file
View File

@@ -0,0 +1,94 @@
<?php
/**
* Created by PhpStorm.
* User: Jaeger <JaegerCode@gmail.com>
* Date: 2017/9/22
*/
namespace QL;
use Closure;
use Tightenco\Collect\Support\Collection;
class Config
{
    /**
     * Shared instance handed out by getInstance().
     *
     * @var Config|null
     */
    protected static $instance = null;

    /**
     * Globally registered plugins. Entries are either a plugin class name or
     * a `[className, options]` pair.
     *
     * @var Collection
     */
    protected $plugins;

    /**
     * Globally bound custom methods, keyed by method name.
     *
     * @var Collection
     */
    protected $binds;

    /**
     * Start with no plugins and no bound methods.
     */
    public function __construct()
    {
        $this->plugins = new Collection();
        $this->binds = new Collection();
    }

    /**
     * Return the shared Config, creating it on first use.
     *
     * @return null|Config
     */
    public static function getInstance()
    {
        if (!self::$instance) {
            self::$instance = new self();
        }

        return self::$instance;
    }

    /**
     * Register one plugin (class name plus options) or merge in a list of
     * plugins, to be installed on every QueryList passed to bootstrap().
     *
     * @param string|array $plugins Plugin class name, or a list of plugin entries
     * @param array ...$opt Options forwarded to the plugin when $plugins is a class name
     * @return $this
     */
    public function use($plugins, ...$opt)
    {
        if (!is_string($plugins)) {
            $this->plugins = $this->plugins->merge($plugins);

            return $this;
        }

        $this->plugins->push([$plugins, $opt]);

        return $this;
    }

    /**
     * Register a custom method to be bound on every QueryList passed to bootstrap().
     *
     * @param string $name
     * @param Closure $provider
     * @return $this
     */
    public function bind(string $name, Closure $provider)
    {
        $this->binds->offsetSet($name, $provider);

        return $this;
    }

    /**
     * Apply the global configuration to a QueryList: plugins first, then bound methods.
     *
     * @param QueryList $queryList
     */
    public function bootstrap(QueryList $queryList)
    {
        $this->installPlugins($queryList);
        $this->installBind($queryList);
    }

    /**
     * Install every registered plugin on the given QueryList.
     *
     * @param QueryList $queryList
     */
    protected function installPlugins(QueryList $queryList)
    {
        foreach ($this->plugins->all() as $entry) {
            if (is_string($entry)) {
                $queryList->use($entry);
                continue;
            }

            // Entry is a [className, options] pair.
            $queryList->use($entry[0], ...$entry[1]);
        }
    }

    /**
     * Bind every registered custom method on the given QueryList.
     *
     * @param QueryList $queryList
     */
    protected function installBind(QueryList $queryList)
    {
        foreach ($this->binds->all() as $methodName => $closure) {
            $queryList->bind($methodName, $closure);
        }
    }
}

View File

@@ -0,0 +1,15 @@
<?php
/**
* Created by PhpStorm.
* User: Jaeger <JaegerCode@gmail.com>
* Date: 2017/9/22
*/
namespace QL\Contracts;
use QL\QueryList;
/**
 * Contract implemented by QueryList plugins.
 */
interface PluginContract
{
/**
 * Install the plugin on the given QueryList instance.
 *
 * @param QueryList $queryList Instance the plugin is installed on
 * @param mixed ...$opt Plugin-specific options
 */
public static function install(QueryList $queryList,...$opt);
}

View File

@@ -0,0 +1,15 @@
<?php
/**
* Created by PhpStorm.
* User: Jaeger <JaegerCode@gmail.com>
* Date: 2017/9/20
*/
namespace QL\Contracts;
use QL\Kernel;
/**
 * Contract implemented by service providers that register services on a Kernel.
 */
interface ServiceProviderContract
{
/**
 * Register this provider's services on the kernel.
 *
 * @param Kernel $kernel Kernel receiving the registrations
 */
public function register(Kernel $kernel);
}

30
src/Dom/Dom.php Normal file
View File

@@ -0,0 +1,30 @@
<?php
/**
* Created by PhpStorm.
* User: Jaeger <JaegerCode@gmail.com>
* Date: 2017/9/19
*/
namespace QL\Dom;
use phpQueryObject;
class Dom
{
    /**
     * Document that selectors are evaluated against.
     *
     * @var phpQueryObject
     */
    protected $document;

    /**
     * @param phpQueryObject $document Document to search in
     */
    public function __construct(phpQueryObject $document)
    {
        $this->document = $document;
    }

    /**
     * Run a selector against the document and wrap the matches.
     *
     * @param string $selector Selector expression
     * @return Elements
     */
    public function find($selector)
    {
        return new Elements($this->document->find($selector));
    }
}

260
src/Dom/Elements.php Normal file
View File

@@ -0,0 +1,260 @@
<?php
/**
* Created by PhpStorm.
* User: Jaeger <JaegerCode@gmail.com>
* Date: 2017/9/19
*/
namespace QL\Dom;
use phpDocumentor\Reflection\Types\Null_;
use phpQueryObject;
use Tightenco\Collect\Support\Collection;
/**
* Class Elements
* @package QL\Dom
*
* @method Elements toReference(&$var)
* @method Elements documentFragment($state = null)
* @method Elements toRoot()
* @method Elements getDocumentIDRef(&$documentID)
* @method Elements getDocument()
* @method \DOMDocument getDOMDocument()
* @method Elements getDocumentID()
* @method Elements unloadDocument()
* @method bool isHTML()
* @method bool isXHTML()
* @method bool isXML()
* @method string serialize()
* @method array serializeArray($submit = null)
* @method \DOMElement|\DOMElement[] get($index = null, $callback1 = null, $callback2 = null, $callback3 = null)
* @method string|array getString($index = null, $callback1 = null, $callback2 = null, $callback3 = null)
* @method string|array getStrings($index = null, $callback1 = null, $callback2 = null, $callback3 = null)
* @method Elements newInstance($newStack = null)
* @method Elements find($selectors, $context = null, $noHistory = false)
* @method Elements|bool is($selector, $nodes = null)
* @method Elements filterCallback($callback, $_skipHistory = false)
* @method Elements filter($selectors, $_skipHistory = false)
* @method Elements load($url, $data = null, $callback = null)
* @method Elements trigger($type, $data = [])
* @method Elements triggerHandler($type, $data = [])
* @method Elements bind($type, $data, $callback = null)
* @method Elements unbind($type = null, $callback = null)
* @method Elements change($callback = null)
* @method Elements submit($callback = null)
* @method Elements click($callback = null)
* @method Elements wrapAllOld($wrapper)
* @method Elements wrapAll($wrapper)
* @method Elements wrapAllPHP($codeBefore, $codeAfter)
* @method Elements wrap($wrapper)
* @method Elements wrapPHP($codeBefore, $codeAfter)
* @method Elements wrapInner($wrapper)
* @method Elements wrapInnerPHP($codeBefore, $codeAfter)
* @method Elements contents()
* @method Elements contentsUnwrap()
* @method Elements switchWith($markup)
* @method Elements eq($num)
* @method Elements size()
* @method Elements length()
* @method int count()
* @method Elements end($level = 1)
* @method Elements _clone()
* @method Elements replaceWithPHP($code)
* @method Elements replaceWith($content)
* @method Elements replaceAll($selector)
* @method Elements remove($selector = null)
* @method Elements|string markup($markup = null, $callback1 = null, $callback2 = null, $callback3 = null)
* @method string markupOuter($callback1 = null, $callback2 = null, $callback3 = null)
* @method Elements|string html($html = null, $callback1 = null, $callback2 = null, $callback3 = null)
* @method Elements|string xml($xml = null, $callback1 = null, $callback2 = null, $callback3 = null)
* @method string htmlOuter($callback1 = null, $callback2 = null, $callback3 = null)
* @method string xmlOuter($callback1 = null, $callback2 = null, $callback3 = null)
* @method Elements php($code)
* @method string markupPHP($code)
* @method string markupOuterPHP()
* @method Elements children($selector)
* @method Elements ancestors($selector)
* @method Elements append($content)
* @method Elements appendPHP($content)
* @method Elements appendTo($seletor)
* @method Elements prepend($content)
* @method Elements prependPHP($content)
* @method Elements prependTo($seletor)
* @method Elements before($content)
* @method Elements beforePHP($content)
* @method Elements insertBefore($seletor)
* @method Elements after($content)
* @method Elements afterPHP($content)
* @method Elements insertAfter($seletor)
* @method Elements insert($target, $type)
* @method int index($subject)
* @method Elements slice($start, $end = null)
* @method Elements reverse()
* @method Elements|string text($text = null, $callback1 = null, $callback2 = null, $callback3 = null)
* @method Elements plugin($class, $file = null)
* @method Elements _next($selector = null)
* @method Elements _prev($selector = null)
* @method Elements prev($selector = null)
* @method Elements prevAll($selector = null)
* @method Elements nextAll($selector = null)
* @method Elements siblings($selector = null)
* @method Elements not($selector = null)
* @method Elements add($selector = null)
* @method Elements parent($selector = null)
* @method Elements parents($selector = null)
* @method Elements stack($nodeTypes = null)
* @method Elements|string attr($attr = null, $value = null)
* @method Elements attrPHP($attr, $code)
* @method Elements removeAttr($attr)
* @method Elements|string val($val = null)
* @method Elements andSelf()
* @method Elements addClass($className)
* @method Elements addClassPHP($className)
* @method bool hasClass($className)
* @method Elements removeClass($className)
* @method Elements toggleClass($className)
* @method Elements _empty()
* @method Elements callback($callback, $param1 = null, $param2 = null, $param3 = null)
* @method string data($key, $value = null)
* @method Elements removeData($key)
* @method void rewind()
* @method Elements current()
* @method int key()
* @method Elements next($cssSelector = null)
* @method bool valid()
* @method bool offsetExists($offset)
* @method Elements offsetGet($offset)
* @method void offsetSet($offset, $value)
* @method string whois($oneNode)
* @method Elements dump()
* @method Elements dumpWhois()
* @method Elements dumpLength()
* @method Elements dumpTree($html, $title)
* @method dumpDie()
*/
class Elements
{
    /**
     * Wrapped phpQuery selection.
     *
     * @var phpQueryObject
     */
    protected $elements;

    /**
     * @param phpQueryObject $elements Selection to wrap
     */
    public function __construct(phpQueryObject $elements)
    {
        $this->elements = $elements;
    }

    /**
     * Read a property of the wrapped selection, or fall back to the HTML
     * attribute of that name when no such property exists.
     *
     * @param string $name
     * @return mixed
     */
    public function __get($name)
    {
        if (property_exists($this->elements, $name)) {
            return $this->elements->$name;
        }

        return $this->elements->attr($name);
    }

    /**
     * Forward a method call to the wrapped selection. phpQuery results are
     * re-wrapped in Elements and string results are trimmed.
     *
     * @param string $name
     * @param array $arguments
     * @return mixed
     */
    public function __call($name, $arguments)
    {
        $result = call_user_func_array([$this->elements, $name], $arguments);

        if ($result instanceof phpQueryObject) {
            return new self($result);
        }

        return is_string($result) ? trim($result) : $result;
    }

    /**
     * Call $callback(Elements $element, $key) for each element; iteration
     * stops as soon as the callback returns exactly false.
     *
     * @param callable $callback
     *
     * @return $this
     */
    public function each(callable $callback)
    {
        foreach ($this->elements as $index => $node) {
            if ($callback(new self(pq($node)), $index) === false) {
                break;
            }
        }

        return $this;
    }

    /**
     * Call $callback(Elements $element) for each element and collect the results.
     *
     * @param callable $callback
     * @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection
     */
    public function map($callback)
    {
        $results = new Collection();

        $this->elements->each(function ($node) use ($results, $callback) {
            $results->push($callback(new self(pq($node))));
        });

        return $results;
    }

    /**
     * Collect the given attribute from every element.
     *
     * @param string $attr HTML attribute name
     * @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection
     */
    public function attrs($attr)
    {
        $readAttribute = static function ($element) use ($attr) {
            return $element->attr($attr);
        };

        return $this->map($readAttribute);
    }

    /**
     * Collect the trimmed text of every element.
     *
     * @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection
     */
    public function texts()
    {
        $readText = static function ($element) {
            return trim($element->text());
        };

        return $this->map($readText);
    }

    /**
     * Collect the trimmed inner HTML of every element.
     *
     * @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection
     */
    public function htmls()
    {
        $readHtml = static function ($element) {
            return trim($element->html());
        };

        return $this->map($readHtml);
    }

    /**
     * Collect the trimmed outer HTML of every element.
     *
     * @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection
     */
    public function htmlOuters()
    {
        $readOuterHtml = static function ($element) {
            return trim($element->htmlOuter());
        };

        return $this->map($readOuterHtml);
    }

    /**
     * Expose the wrapped phpQuery selection.
     *
     * @return phpQueryObject
     */
    public function getElements(): phpQueryObject
    {
        return $this->elements;
    }
}

322
src/Dom/Query.php Normal file
View File

@@ -0,0 +1,322 @@
<?php
/**
* Created by PhpStorm.
* User: Jaeger <JaegerCode@gmail.com>
* Date: 2017/9/21
*/
namespace QL\Dom;
use Tightenco\Collect\Support\Collection;
use phpQuery;
use phpQueryObject;
use QL\QueryList;
use Closure;
class Query
{
    /**
     * Raw HTML as last passed to setHtml().
     *
     * @var string|null
     */
    protected $html;

    /**
     * Parsed document built from $html.
     *
     * @var \phpQueryObject
     */
    protected $document;

    /**
     * Crawl rules set through rules().
     *
     * @var array
     */
    protected $rules;

    /**
     * Selector of the repeating list item, or null when rules apply to the whole document.
     *
     * @var string|null
     */
    protected $range = null;

    /**
     * Owning QueryList, returned from the fluent setters.
     *
     * @var QueryList
     */
    protected $ql;

    /**
     * Result of the last query()/setData() call.
     *
     * @var Collection
     */
    protected $data;

    /**
     * @param QueryList $ql Owning QueryList instance
     */
    public function __construct(QueryList $ql)
    {
        $this->ql = $ql;
    }

    /**
     * Get the current HTML.
     *
     * @param bool $rel true to serialize the parsed document, false to return the raw string given to setHtml()
     * @return String
     */
    public function getHtml($rel = true)
    {
        return $rel ? $this->document->htmlOuter() : $this->html;
    }

    /**
     * Replace the current HTML and rebuild the parsed document.
     *
     * @param $html
     * @param null $charset
     * @return QueryList
     */
    public function setHtml($html, $charset = null)
    {
        $this->html = value($html);
        // Unload the previous document before creating a new one.
        $this->destroyDocument();
        $this->document = phpQuery::newDocumentHTML($this->html, $charset);
        return $this->ql;
    }

    /**
     * Get crawl results
     *
     * @param Closure|null $callback Optional post-processing callback, see handleData()
     * @return Collection|static
     */
    public function getData(?Closure $callback = null)
    {
        return $this->handleData($this->data, $callback);
    }

    /**
     * Overwrite the stored crawl results.
     *
     * @param Collection $data
     */
    public function setData(Collection $data)
    {
        $this->data = $data;
    }

    /**
     * Searches for all elements that match the specified expression.
     *
     * @param $selector A string containing a selector expression to match elements against.
     * @return Elements
     */
    public function find($selector)
    {
        return (new Dom($this->document))->find($selector);
    }

    /**
     * Set crawl rule
     *
     * $rules = [
     *    'rule_name1' => ['selector','HTML attribute | text | html','Tag filter list','callback'],
     *    'rule_name2' => ['selector','HTML attribute | text | html','Tag filter list','callback'],
     *    // ...
     *  ]
     *
     * @param array $rules
     * @return QueryList
     */
    public function rules(array $rules)
    {
        $this->rules = $rules;
        return $this->ql;
    }

    /**
     * Set the slice area for crawl list
     *
     * @param $selector
     * @return QueryList
     */
    public function range($selector)
    {
        $this->range = $selector;
        return $this->ql;
    }

    /**
     * Remove HTML head,try to solve the garbled
     *
     * @return QueryList
     */
    public function removeHead()
    {
        $html = preg_replace('/(<head>|<head\s+.+?>).+?<\/head>/is', '<head></head>', $this->html);
        // preg_replace() yields null on error; only reload when it produced usable HTML.
        $html && $this->setHtml($html);
        return $this->ql;
    }

    /**
     * Execute the query rule
     *
     * @param Closure|null $callback Optional post-processing callback, see handleData()
     * @return QueryList
     */
    public function query(?Closure $callback = null)
    {
        $this->data = $this->getList();
        $this->data = $this->handleData($this->data, $callback);
        return $this->ql;
    }

    /**
     * Post-process crawl results with an optional callback.
     *
     * Without a range the callback receives the whole result array (and null)
     * and its return value becomes the new collection; with a range it is
     * mapped over the list items.
     *
     * @param Collection $data
     * @param callable|null $callback
     * @return Collection
     */
    public function handleData(Collection $data, $callback)
    {
        if (is_callable($callback)) {
            if (empty($this->range)) {
                $data = new Collection($callback($data->all(), null));
            } else {
                $data = $data->map($callback);
            }
        }
        return $data;
    }

    /**
     * Apply the rules to the document (or to each range element) and collect the results.
     *
     * @return Collection
     */
    protected function getList()
    {
        $data = [];
        if (empty($this->range)) {
            foreach ($this->rules as $key => $reg_value) {
                $rule = $this->parseRule($reg_value);
                $contentElements = $this->document->find($rule['selector']);
                $data[$key] = $this->extractContent($contentElements, $key, $rule);
            }
        } else {
            $rangeElements = $this->document->find($this->range);
            $i = 0;
            foreach ($rangeElements as $element) {
                foreach ($this->rules as $key => $reg_value) {
                    $rule = $this->parseRule($reg_value);
                    $contentElements = pq($element)->find($rule['selector']);
                    $data[$i][$key] = $this->extractContent($contentElements, $key, $rule);
                }
                $i++;
            }
        }
        return new Collection($data);
    }

    /**
     * Extract the value a rule asks for from the matched elements, then run
     * the rule's callback (if any) on it.
     *
     * @param phpQueryObject $pqObj Elements matched by the rule selector
     * @param string $ruleName Rule key, passed to the callback
     * @param array $rule Parsed rule, see parseRule()
     * @return mixed
     */
    protected function extractContent(phpQueryObject $pqObj, $ruleName, $rule)
    {
        switch ($rule['attr']) {
            case 'text':
                $content = $this->allowTags($pqObj->html(), $rule['filter_tags']);
                break;
            case 'texts':
                $content = (new Elements($pqObj))->map(function (Elements $element) use ($rule) {
                    return $this->allowTags($element->html(), $rule['filter_tags']);
                })->all();
                break;
            case 'html':
                $content = $this->stripTags($pqObj->html(), $rule['filter_tags']);
                break;
            case 'htmls':
                $content = (new Elements($pqObj))->map(function (Elements $element) use ($rule) {
                    return $this->stripTags($element->html(), $rule['filter_tags']);
                })->all();
                break;
            case 'htmlOuter':
                $content = $this->stripTags($pqObj->htmlOuter(), $rule['filter_tags']);
                break;
            case 'htmlOuters':
                $content = (new Elements($pqObj))->map(function (Elements $element) use ($rule) {
                    return $this->stripTags($element->htmlOuter(), $rule['filter_tags']);
                })->all();
                break;
            default:
                if (preg_match('/attr\((.+)\)/', $rule['attr'], $arr)) {
                    $content = $pqObj->attr($arr[1]);
                } elseif (preg_match('/attrs\((.+)\)/', $rule['attr'], $arr)) {
                    $content = (new Elements($pqObj))->attrs($arr[1])->all();
                } else {
                    $content = $pqObj->attr($rule['attr']);
                }
                break;
        }

        if (is_callable($rule['handle_callback'])) {
            $content = call_user_func($rule['handle_callback'], $content, $ruleName);
        }

        return $content;
    }

    /**
     * Normalize a positional rule array into named parts.
     *
     * @param array $rule [selector, attr, filter tags (optional), callback (optional)]
     * @return array
     */
    protected function parseRule($rule)
    {
        $result = [];
        $result['selector'] = $rule[0];
        $result['attr'] = $rule[1];
        $result['filter_tags'] = $rule[2] ?? '';
        $result['handle_callback'] = $rule[3] ?? null;
        return $result;
    }

    /**
     * Strip specific HTML tags (the tags themselves; their content is kept).
     * Tags prefixed with "-" are removed together with their content.
     *
     * @param  string $html
     * @param  string $tags_str Tag names separated by whitespace
     * @return string
     */
    protected function stripTags($html, $tags_str)
    {
        $tagsArr = $this->tag($tags_str);
        $html = $this->removeTags($html, $tagsArr[1]);
        $patterns = [];
        foreach ($tagsArr[0] as $tag) {
            // The lookahead requires the tag name to end here, so stripping
            // "p" does not also strip <pre>, <param>, ...
            $patterns[] = '/<\/?' . preg_quote($tag, '/') . '(?=[\s\/>])[^>]*>/i';
        }
        return preg_replace($patterns, '', trim($html));
    }

    /**
     * Keep only specific HTML tags; every other tag is stripped.
     * Tags prefixed with "-" are removed together with their content.
     *
     * @param  string $html
     * @param  string $tags_str Tag names separated by whitespace
     * @return string
     */
    protected function allowTags($html, $tags_str)
    {
        $tagsArr = $this->tag($tags_str);
        $html = $this->removeTags($html, $tagsArr[1]);
        $allow = '';
        foreach ($tagsArr[0] as $tag) {
            $allow .= "<$tag> ";
        }
        return strip_tags(trim($html), $allow);
    }

    /**
     * Split a tag filter string into plain tags and "-"-prefixed tags.
     *
     * @param string $tags_str Tag names separated by whitespace
     * @return array [plain tag names, tag names that were prefixed with "-"]
     */
    protected function tag($tags_str)
    {
        $tagArr = preg_split("/\s+/", $tags_str, -1, PREG_SPLIT_NO_EMPTY);
        $tags = [[], []];
        foreach ($tagArr as $tag) {
            // Anchored so that only a leading "-" marks a removal; hyphenated
            // names such as "my-element" stay plain tags.
            if (preg_match('/^-(.+)/', $tag, $arr)) {
                $tags[1][] = $arr[1];
            } else {
                $tags[0][] = $tag;
            }
        }
        return $tags;
    }

    /**
     * Remove specific HTML elements, including their content.
     *
     * @param  string $html
     * @param  array $tags Tag names (used as a comma-separated selector)
     * @return string
     */
    protected function removeTags($html, $tags)
    {
        if (count($tags)) {
            $selector = implode(',', $tags);
            $doc = phpQuery::newDocumentHTML($html);
            pq($doc)->find($selector)->remove();
            $html = pq($doc)->htmlOuter();
            // Unload the temporary document once the HTML has been extracted.
            $doc->unloadDocument();
        }
        return $html;
    }

    /**
     * Unload the current document, if one has been created.
     */
    protected function destroyDocument()
    {
        if ($this->document instanceof phpQueryObject) {
            $this->document->unloadDocument();
        }
    }

    /**
     * Unload the document when the Query is destroyed.
     */
    public function __destruct()
    {
        $this->destroyDocument();
    }
}

View File

@@ -0,0 +1,15 @@
<?php
/**
* Created by PhpStorm.
* User: Jaeger <JaegerCode@gmail.com>
* Date: 2017/9/21
*/
namespace QL\Exceptions;
use Exception;
/**
 * Exception raised when a requested service name has no binding.
 */
class ServiceNotFoundException extends Exception
{
}

74
src/Kernel.php Normal file
View File

@@ -0,0 +1,74 @@
<?php
/**
* Created by PhpStorm.
* User: Jaeger <JaegerCode@gmail.com>
* Date: 2017/9/21
*/
namespace QL;
use QL\Contracts\ServiceProviderContract;
use QL\Exceptions\ServiceNotFoundException;
use QL\Providers\EncodeServiceProvider;
use Closure;
use QL\Providers\HttpServiceProvider;
use QL\Providers\PluginServiceProvider;
use QL\Providers\SystemServiceProvider;
use Tightenco\Collect\Support\Collection;
class Kernel
{
    /**
     * Service provider classes registered during bootstrap(), in order.
     *
     * @var string[]
     */
    protected $providers = [
        SystemServiceProvider::class,
        HttpServiceProvider::class,
        EncodeServiceProvider::class,
        PluginServiceProvider::class
    ];

    /**
     * Bound services, keyed by name.
     *
     * @var Collection
     */
    protected $binds;

    /**
     * QueryList instance this kernel belongs to.
     *
     * @var QueryList
     */
    protected $ql;

    /**
     * @param QueryList $ql Owning QueryList instance
     */
    public function __construct(QueryList $ql)
    {
        $this->ql = $ql;
        $this->binds = new Collection();
    }

    /**
     * Register all built-in service providers.
     *
     * @return $this
     */
    public function bootstrap()
    {
        $this->registerProviders();

        return $this;
    }

    /**
     * Instantiate each configured provider and let it register its services.
     */
    public function registerProviders()
    {
        foreach ($this->providers as $providerClass) {
            $provider = new $providerClass();
            $this->register($provider);
        }
    }

    /**
     * Bind a service closure under a name, replacing any existing binding.
     *
     * @param string $name
     * @param Closure $provider
     */
    public function bind(string $name, Closure $provider)
    {
        $this->binds->offsetSet($name, $provider);
    }

    /**
     * Look up a bound service closure.
     *
     * @param string $name
     * @return Closure
     * @throws ServiceNotFoundException when nothing is bound under $name
     */
    public function getService(string $name)
    {
        if ($this->binds->offsetExists($name)) {
            return $this->binds->offsetGet($name);
        }

        throw new ServiceNotFoundException("Service: {$name} not found!");
    }

    /**
     * Let a provider register its services on this kernel.
     *
     * @param ServiceProviderContract $instance
     */
    private function register(ServiceProviderContract $instance)
    {
        $instance->register($this);
    }
}

View File

@@ -0,0 +1,22 @@
<?php
/**
* Created by PhpStorm.
* User: Jaeger <JaegerCode@gmail.com>
* Date: 2017/9/20
*/
namespace QL\Providers;
use QL\Contracts\ServiceProviderContract;
use QL\Kernel;
use QL\Services\EncodeService;
class EncodeServiceProvider implements ServiceProviderContract
{
    /**
     * Bind the `encoding` service, which converts the current HTML via EncodeService.
     *
     * @param Kernel $kernel
     */
    public function register(Kernel $kernel)
    {
        // $this inside the closure is supplied by whoever invokes the bound
        // service, not by this provider.
        // The nullable type is spelled out: an implicit nullable default is
        // deprecated as of PHP 8.4.
        $kernel->bind('encoding', function (string $outputEncoding, ?string $inputEncoding = null) {
            return EncodeService::convert($this, $outputEncoding, $inputEncoding);
        });
    }
}

View File

@@ -0,0 +1,40 @@
<?php
/**
* Created by PhpStorm.
* User: Jaeger <JaegerCode@gmail.com>
* Date: 2017/9/22
*/
namespace QL\Providers;
use QL\Contracts\ServiceProviderContract;
use QL\Kernel;
use QL\Services\HttpService;
use QL\Services\MultiRequestService;
class HttpServiceProvider implements ServiceProviderContract
{
    /**
     * Bind the HTTP services: `get`, `post` and `postJson` delegate to
     * HttpService, `multiGet` and `multiPost` create a MultiRequestService.
     *
     * @param Kernel $kernel
     */
    public function register(Kernel $kernel)
    {
        // Single-request services: the service name equals the HttpService method.
        foreach (['get', 'post', 'postJson'] as $method) {
            $kernel->bind($method, function (...$args) use ($method) {
                return HttpService::$method($this, ...$args);
            });
        }

        // Multi-request services, keyed by service name => HTTP verb.
        foreach (['multiGet' => 'get', 'multiPost' => 'post'] as $service => $verb) {
            $kernel->bind($service, function (...$args) use ($verb) {
                return new MultiRequestService($this, $verb, ...$args);
            });
        }
    }
}

View File

@@ -0,0 +1,23 @@
<?php
/**
* Created by PhpStorm.
* User: Jaeger <JaegerCode@gmail.com>
* Date: 2017/9/22
*/
namespace QL\Providers;
use QL\Contracts\ServiceProviderContract;
use QL\Kernel;
use QL\Services\PluginService;
class PluginServiceProvider implements ServiceProviderContract
{
    /**
     * Bind the `use` service, which installs plugins through PluginService.
     *
     * @param Kernel $kernel
     */
    public function register(Kernel $kernel)
    {
        $installer = function ($plugins, ...$opt) {
            $queryList = PluginService::install($this, $plugins, ...$opt);

            return $queryList;
        };

        $kernel->bind('use', $installer);
    }
}

View File

@@ -0,0 +1,32 @@
<?php
/**
* Created by PhpStorm.
* User: Jaeger <JaegerCode@gmail.com>
* Date: 2017/9/22
*/
namespace QL\Providers;
use QL\Contracts\ServiceProviderContract;
use QL\Kernel;
use Closure;
class SystemServiceProvider implements ServiceProviderContract
{
    /**
     * Bind the core services `html`, `queryData` and `pipe`.
     *
     * $this inside each closure is supplied by whoever invokes the bound
     * service, not by this provider.
     *
     * @param Kernel $kernel
     */
    public function register(Kernel $kernel)
    {
        // html(...): forward to setHtml() and return the bound object.
        $kernel->bind('html', function (...$args) {
            $this->setHtml(...$args);
            return $this;
        });

        // queryData($callback): run the query and return the results as a plain array.
        $kernel->bind('queryData', function (?Closure $callback = null) {
            return $this->query()->getData($callback)->all();
        });

        // pipe($callback): hand the bound object to the callback and return its result.
        // Without a callback the bound object is returned unchanged; previously
        // this called null and raised a fatal error.
        $kernel->bind('pipe', function (?Closure $callback = null) {
            if ($callback === null) {
                return $this;
            }
            return $callback($this);
        });
    }
}

133
src/QueryList.php Normal file
View File

@@ -0,0 +1,133 @@
<?php
/**
* QueryList
*
* 一个基于phpQuery的通用列表采集类
*
* @author Jaeger
* @email JaegerCode@gmail.com
* @link https://github.com/jae-jae/QueryList
* @version 4.0.0
*
*/
namespace QL;
use phpQuery;
use QL\Dom\Query;
use Tightenco\Collect\Support\Collection;
use Closure;
use QL\Services\MultiRequestService;
/**
* Class QueryList
* @package QL
*
* @method string getHtml($rel = true)
* @method QueryList setHtml($html)
* @method QueryList html($html)
* @method Dom\Elements find($selector)
* @method QueryList rules(array $rules)
* @method QueryList range($range)
* @method QueryList removeHead()
* @method QueryList query(Closure $callback = null)
* @method Collection getData(Closure $callback = null)
* @method Array queryData(Closure $callback = null)
* @method QueryList setData(Collection $data)
* @method QueryList encoding(string $outputEncoding,string $inputEncoding = null)
* @method QueryList get($url,$args = null,$otherArgs = [])
* @method QueryList post($url,$args = null,$otherArgs = [])
* @method QueryList postJson($url,$args = null,$otherArgs = [])
* @method MultiRequestService multiGet($urls)
* @method MultiRequestService multiPost($urls)
* @method QueryList use($plugins,...$opt)
* @method QueryList pipe(Closure $callback = null)
*/
class QueryList
{
    /**
     * DOM query engine; its methods are reachable through __call().
     *
     * @var Query
     */
    protected $query;

    /**
     * Service kernel holding the bound methods.
     *
     * @var Kernel
     */
    protected $kernel;

    /**
     * Shared instance handed out by getInstance().
     *
     * @var QueryList|null
     */
    protected static $instance = null;

    /**
     * Create the query engine and kernel, then apply the global Config.
     */
    public function __construct()
    {
        $this->query = new Query($this);

        $kernel = new Kernel($this);
        $this->kernel = $kernel->bootstrap();

        Config::getInstance()->bootstrap($this);
    }

    /**
     * Dispatch unknown instance methods: Query methods take precedence,
     * anything else is resolved as a kernel service and invoked with this
     * object as its $this.
     *
     * @param string $name
     * @param array $arguments
     * @return mixed
     */
    public function __call($name, $arguments)
    {
        if (method_exists($this->query, $name)) {
            return $this->query->$name(...$arguments);
        }

        $service = $this->kernel->getService($name);

        return $service->call($this, ...$arguments);
    }

    /**
     * Dispatch unknown static methods to the same-named method of a fresh instance.
     *
     * @param string $name
     * @param array $arguments
     * @return mixed
     */
    public static function __callStatic($name, $arguments)
    {
        return (new self())->$name(...$arguments);
    }

    /**
     * Release held resources when the object is destroyed.
     */
    public function __destruct()
    {
        $this->destruct();
    }

    /**
     * Get the QueryList single instance
     *
     * @return QueryList
     */
    public static function getInstance()
    {
        if (!self::$instance) {
            self::$instance = new self();
        }

        return self::$instance;
    }

    /**
     * Get the Config instance
     *
     * @return null|Config
     */
    public static function config()
    {
        return Config::getInstance();
    }

    /**
     * Destruction of resources: drops the query engine and the kernel.
     */
    public function destruct()
    {
        unset($this->query, $this->kernel);
    }

    /**
     * Destroy all documents
     */
    public static function destructDocuments()
    {
        phpQuery::$documents = [];
    }

    /**
     * Bind a custom method to the QueryList object
     *
     * @param string $name Invoking the name
     * @param Closure $provide Called method
     * @return $this
     */
    public function bind(string $name, Closure $provide)
    {
        $this->kernel->bind($name, $provide);

        return $this;
    }
}

View File

@@ -0,0 +1,37 @@
<?php
/**
* Created by PhpStorm.
* User: Jaeger <JaegerCode@gmail.com>
* Date: 2017/9/20
* 编码转换服务
*/
namespace QL\Services;
use QL\QueryList;
class EncodeService
{
    /**
     * Convert the QueryList's current HTML to another character encoding.
     *
     * When the input encoding cannot be determined, or iconv() fails, the
     * HTML is left untouched instead of being replaced with `false`.
     *
     * @param QueryList $ql
     * @param string $outputEncoding Target encoding
     * @param string|null $inputEncoding Source encoding; detected from the HTML when omitted
     * @return QueryList
     */
    public static function convert(QueryList $ql, string $outputEncoding, ?string $inputEncoding = null)
    {
        $html = $ql->getHtml();
        $inputEncoding || $inputEncoding = self::detect($html);
        if (!$inputEncoding) {
            // Detection failed: there is nothing sensible to convert from.
            return $ql;
        }

        // //IGNORE drops characters that cannot be represented in the target encoding.
        $converted = iconv($inputEncoding, $outputEncoding . '//IGNORE', $html);
        if ($converted !== false) {
            $ql->setHtml($converted);
        }
        return $ql;
    }

    /**
     * Attempts to detect the encoding
     *
     * @param $string
     * @return bool|false|mixed|string Detected encoding name, or false when none of the candidates match
     */
    public static function detect($string)
    {
        $charset = mb_detect_encoding($string, ['ASCII', 'GB2312', 'GBK', 'UTF-8'], true);
        // Report the CP936 code page under the name GBK.
        if ($charset !== false && strtolower($charset) === 'cp936') {
            $charset = 'GBK';
        }
        return $charset;
    }
}

View File

@@ -0,0 +1,59 @@
<?php
/**
* Created by PhpStorm.
* User: Jaeger <JaegerCode@gmail.com>
* Date: 2017/9/22
*/
namespace QL\Services;
use GuzzleHttp\Cookie\CookieJar;
use Jaeger\GHttp;
use QL\QueryList;
class HttpService
{
    /**
     * Cookie jar shared by every request made through this service.
     *
     * @var CookieJar|null
     */
    protected static $cookieJar = null;

    /**
     * Get the shared cookie jar, creating it on first use.
     *
     * @return CookieJar
     */
    public static function getCookieJar()
    {
        if (self::$cookieJar === null) {
            self::$cookieJar = new CookieJar();
        }
        return self::$cookieJar;
    }

    /**
     * Fetch a URL with GET and load the response into the QueryList.
     *
     * @param QueryList $ql
     * @param string $url
     * @param mixed $args Request parameters, passed through to GHttp
     * @param array $otherArgs Extra request options; override the defaults
     * @return QueryList
     */
    public static function get(QueryList $ql, $url, $args = null, $otherArgs = [])
    {
        $html = GHttp::get($url, $args, self::withDefaultOptions($otherArgs));
        $ql->setHtml($html);
        return $ql;
    }

    /**
     * Send a POST request and load the response into the QueryList.
     *
     * @param QueryList $ql
     * @param string $url
     * @param mixed $args Request parameters, passed through to GHttp
     * @param array $otherArgs Extra request options; override the defaults
     * @return QueryList
     */
    public static function post(QueryList $ql, $url, $args = null, $otherArgs = [])
    {
        $html = GHttp::post($url, $args, self::withDefaultOptions($otherArgs));
        $ql->setHtml($html);
        return $ql;
    }

    /**
     * Send a POST request through GHttp::postJson() and load the response into the QueryList.
     *
     * @param QueryList $ql
     * @param string $url
     * @param mixed $args Request parameters, passed through to GHttp
     * @param array $otherArgs Extra request options; override the defaults
     * @return QueryList
     */
    public static function postJson(QueryList $ql, $url, $args = null, $otherArgs = [])
    {
        $html = GHttp::postJson($url, $args, self::withDefaultOptions($otherArgs));
        $ql->setHtml($html);
        return $ql;
    }

    /**
     * Merge caller options over the service defaults (shared cookie jar,
     * `verify` => false).
     *
     * NOTE(review): `verify` => false presumably disables TLS certificate
     * verification in the underlying client - confirm. It is kept as the
     * default for backward compatibility; callers can pass 'verify' => true.
     *
     * @param array $otherArgs Caller-supplied request options
     * @return array
     */
    private static function withDefaultOptions($otherArgs)
    {
        return array_merge([
            'cookies' => self::getCookieJar(),
            'verify' => false
        ], $otherArgs);
    }
}

View File

@@ -0,0 +1,66 @@
<?php
/**
* Created by PhpStorm.
* User: Jaeger <JaegerCode@gmail.com>
* Date: 18/12/10
* Time: 下午7:05
*/
namespace QL\Services;
use Jaeger\GHttp;
use Closure;
use GuzzleHttp\Psr7\Response;
use QL\QueryList;
use GuzzleHttp\Exception\RequestException;
/**
 * Class MultiRequestService
 * @package QL\Services
 *
 * Fluent wrapper around the object returned by GHttp::multiRequest(). Method
 * calls not defined here are forwarded to that object through __call(), and
 * the success/error callbacks receive the QueryList instance as first argument.
 *
 * @method MultiRequestService withHeaders($headers)
 * @method MultiRequestService withOptions($options)
 * @method MultiRequestService concurrency($concurrency)
 */
class MultiRequestService
{
    /** QueryList instance handed to the success and error callbacks. */
    protected $ql;
    /** Underlying multi-request object created by GHttp::multiRequest(). */
    protected $multiRequest;
    /** Name of the method invoked on the multi-request object by send(). */
    protected $method;

    /**
     * @param QueryList $ql     Instance passed to the callbacks.
     * @param mixed     $method Name of the method that send() calls on the multi-request object.
     * @param mixed     $urls   URLs handed to GHttp::multiRequest().
     */
    public function __construct(QueryList $ql,$method,$urls)
    {
        $this->ql = $ql;
        $this->method = $method;
        $this->multiRequest = GHttp::multiRequest($urls);
    }

    /**
     * Forward any other method call to the multi-request object and keep
     * whatever it returns as the new multi-request object.
     *
     * @return $this
     */
    public function __call($name, $arguments)
    {
        $this->multiRequest = $this->multiRequest->$name(...$arguments);
        return $this;
    }

    /**
     * Register the callback run for each successful response.
     *
     * Before the callback runs, the response body is loaded into the
     * QueryList instance. The callback receives ($ql, $response, $index).
     *
     * @return $this
     */
    public function success(Closure $success)
    {
        $this->multiRequest = $this->multiRequest->success(function(Response $response, $index) use($success){
            $this->ql->setHtml((string)$response->getBody());
            $success($this->ql,$response, $index);
        });
        return $this;
    }

    /**
     * Register the callback run for each failed request.
     *
     * The callback receives ($ql, $reason, $index).
     *
     * @return $this
     */
    public function error(Closure $error)
    {
        // $reason is intentionally not type-hinted as RequestException: in
        // Guzzle 7 a connection failure is reported as a ConnectException,
        // which no longer extends RequestException, so a hint here would
        // raise a TypeError instead of reaching the user's callback.
        $this->multiRequest = $this->multiRequest->error(function($reason, $index) use($error){
            $error($this->ql,$reason, $index);
        });
        return $this;
    }

    /**
     * Start the requests by calling the configured method on the multi-request object.
     */
    public function send()
    {
        $this->multiRequest->{$this->method}();
    }
}

View File

@@ -0,0 +1,26 @@
<?php
/**
* Created by PhpStorm.
* User: Jaeger <JaegerCode@gmail.com>
* Date: 2017/9/22
*/
namespace QL\Services;
use QL\QueryList;
/**
 * Installs one or more plugins into a QueryList instance.
 */
class PluginService
{
    /**
     * Call the static install() method of the given plugin(s).
     *
     * @param QueryList $queryList Instance the plugins are installed into.
     * @param mixed     $plugins   A single plugin class, or an array of plugin classes.
     * @param mixed     ...$opt    Extra arguments; forwarded only when a single
     *                             plugin is given, not for an array of plugins.
     * @return QueryList The same $queryList instance.
     */
    public static function install(QueryList $queryList, $plugins, ...$opt)
    {
        if (!is_array($plugins)) {
            $plugins::install($queryList, ...$opt);
            return $queryList;
        }

        foreach ($plugins as $pluginClass) {
            $pluginClass::install($queryList);
        }

        return $queryList;
    }
}

71
tests/Dom/FindTest.php Normal file
View File

@@ -0,0 +1,71 @@
<?php
/**
* Created by PhpStorm.
* User: x
* Date: 2018/12/10
* Time: 12:46 AM
*/
namespace Tests\Dom;
use QL\QueryList;
use Tests\TestCaseBase;
/**
 * Tests for reading element attributes through QueryList::find().
 */
class FindTest extends TestCaseBase
{
    protected $html;
    protected $ql;

    /**
     * Load the 'snippet-1' fixture into a fresh QueryList before each test.
     */
    protected function setUp(): void
    {
        $this->html = $this->getSnippet('snippet-1');
        $this->ql = QueryList::html($this->html);
    }

    /**
     * The different ways of addressing the first image must all yield the
     * same src, and its other attributes must be readable as properties.
     *
     * @test
     */
    public function find_first_dom_attr()
    {
        $img = [];
        $img[] = $this->ql->find('img')->attr('src');
        $img[] = $this->ql->find('img')->src;
        $img[] = $this->ql->find('img:eq(0)')->src;
        $img[] = $this->ql->find('img')->eq(0)->src;
        $alt = $this->ql->find('img')->alt;
        $abc = $this->ql->find('img')->abc;
        $this->assertCount(1,array_unique($img));
        // Expected value first, actual second, so failure messages read correctly.
        $this->assertEquals('这是图片',$alt);
        $this->assertEquals('这是一个自定义属性',$abc);
    }

    /**
     * The different ways of addressing the second image must all yield the same alt.
     *
     * @test
     */
    public function find_second_dom_attr()
    {
        $img2 = [];
        $img2[] = $this->ql->find('img')->eq(1)->alt;
        $img2[] = $this->ql->find('img:eq(1)')->alt;
        $img2[] = $this->ql->find('.second_pic')->alt;
        $this->assertCount(1,array_unique($img2));
    }

    /**
     * attr('*') must return every attribute of the matched element.
     *
     * @test
     */
    public function find_dom_all_attr()
    {
        $imgAttr = $this->ql->find('img:eq(0)')->attr('*');
        $linkAttr = $this->ql->find('a:eq(1)')->attr('*');
        $this->assertCount(3,$imgAttr);
        $this->assertCount(1,$linkAttr);
    }
}

43
tests/Dom/RulesTest.php Normal file
View File

@@ -0,0 +1,43 @@
<?php
/**
* Created by PhpStorm.
* User: Jaeger <JaegerCode@gmail.com>
* Date: 18/12/12
* Time: 下午12:25
*/
namespace Tests\Dom;
use QL\QueryList;
use Tests\TestCaseBase;
use Tightenco\Collect\Support\Collection;
/**
 * Tests for extracting structured data with rules and a range selector.
 */
class RulesTest extends TestCaseBase
{
    protected $html;
    protected $ql;

    /**
     * Load the 'snippet-2' fixture into a fresh QueryList before each test.
     */
    protected function setUp(): void
    {
        $this->html = $this->getSnippet('snippet-2');
        $this->ql = QueryList::html($this->html);
    }

    /**
     * Querying with rules over the 'ul>li' range must give a Collection
     * holding one row per list item.
     *
     * @test
     */
    public function get_data_by_rules()
    {
        $rows = QueryList::rules([
                'a' => ['a','text'],
                'img_src' => ['img','src'],
                'img_alt' => ['img','alt']
            ])
            ->range('ul>li')
            ->html($this->html)
            ->query()
            ->getData();

        $this->assertInstanceOf(Collection::class, $rows);
        $this->assertCount(3, $rows);
        $this->assertEquals('http://querylist.com/2.jpg', $rows[1]['img_src']);
    }
}

103
tests/Feature/HttpTest.php Normal file
View File

@@ -0,0 +1,103 @@
<?php
/**
* Created by PhpStorm.
* User: x
* Date: 2018/12/10
* Time: 12:35 AM
*/
namespace Tests\Feature;
use GuzzleHttp\Handler\MockHandler;
use GuzzleHttp\Psr7\Response;
use QL\QueryList;
use Tests\TestCaseBase;
/**
 * Tests for QueryList's HTTP features: JSON posting, concurrent requests and
 * cached requests. Apart from can_post_json_data, which uses a Guzzle
 * MockHandler, the tests send requests to the httpbin.org URLs below.
 */
class HttpTest extends TestCaseBase
{
    protected $urls;

    /**
     * Prepare the list of URLs used by the request tests.
     */
    protected function setUp(): void
    {
        $this->urls = [
            'http://httpbin.org/get?name=php',
            'http://httpbin.org/get?name=golang',
            'http://httpbin.org/get?name=c++',
            'http://httpbin.org/get?name=java'
        ];
    }

    /**
     * postJson() must send the data JSON-encoded as the request body.
     *
     * @test
     */
    public function can_post_json_data()
    {
        $mock = new MockHandler([new Response()]);
        $data = [
            'name' => 'foo'
        ];
        QueryList::postJson('http://foo.com',$data,[
            'handler' => $mock
        ]);
        // Expected value first, actual second, so failure messages read correctly.
        $this->assertEquals(json_encode($data),(string)$mock->getLastRequest()->getBody());
    }

    /**
     * Each concurrent response must correspond to the URL at the same index.
     *
     * @test
     */
    public function concurrent_requests_base_use()
    {
        $urls = $this->urls;
        QueryList::getInstance()
            ->multiGet($urls)
            ->success(function(QueryList $ql,Response $response, $index) use($urls){
                $body = json_decode((string)$response->getBody(),true);
                $this->assertEquals($urls[$index],$body['url']);
            })->send();
    }

    /**
     * Concurrent requests with concurrency, options and headers configured:
     * successful responses must echo the User-Agent, and the only URL
     * reported to the error callback must be the unreachable one.
     *
     * @test
     */
    public function concurrent_requests_advanced_use()
    {
        $ua = 'QueryList/4.0';
        $errorUrl = 'http://web-site-not-exist.com';
        $urls = array_merge($this->urls,[$errorUrl]);
        QueryList::rules([])
            ->multiGet($urls)
            ->concurrency(2)
            ->withOptions([
                'timeout' => 60
            ])
            ->withHeaders([
                'User-Agent' => $ua
            ])
            ->success(function (QueryList $ql, Response $response, $index) use($ua){
                $body = json_decode((string)$response->getBody(),true);
                $this->assertEquals($ua,$body['headers']['User-Agent']);
            })
            ->error(function (QueryList $ql, $reason, $index) use($urls,$errorUrl){
                // Expected value first, actual second.
                $this->assertEquals($errorUrl,$urls[$index]);
            })
            ->send();
    }

    /**
     * A GET request issued with the cache options must still return the expected body.
     *
     * @test
     */
    public function request_with_cache()
    {
        $url = $this->urls[0];
        $data = QueryList::get($url,null,[
            'cache' => sys_get_temp_dir(),
            'cache_ttl' => 600
        ])->getHtml();
        $data = json_decode($data,true);
        $this->assertEquals($url,$data['url']);
    }
}

View File

@@ -0,0 +1,48 @@
<?php
/**
* Created by PhpStorm.
* User: x
* Date: 2018/12/9
* Time: 11:10 PM
*/
namespace Tests\Feature;
use QL\QueryList;
use Tests\TestCaseBase;
/**
 * Tests for how QueryList instances are obtained: shared singleton versus new objects.
 */
class InstanceTest extends TestCaseBase
{
    protected $html;

    /**
     * Read the 'snippet-1' fixture before each test.
     */
    protected function setUp(): void
    {
        $this->html = $this->getSnippet('snippet-1');
    }

    /**
     * HTML set through getInstance() must be visible on a later getInstance() call.
     *
     * @test
     */
    public function singleton_instance_mode()
    {
        $first = QueryList::getInstance()->html($this->html);
        $second = QueryList::getInstance();

        $this->assertEquals($first->getHtml(), $second->getHtml());
    }

    /**
     * Objects created with `new` or through the static range() call must not share HTML.
     *
     * @test
     */
    public function get_new_object()
    {
        $constructedWithHtml = (new QueryList())->html($this->html);
        $constructedEmpty = (new QueryList())->html('');
        $this->assertNotEquals($constructedWithHtml->getHtml(), $constructedEmpty->getHtml());

        $staticWithHtml = QueryList::range('')->html($this->html);
        $staticEmpty = QueryList::range('')->html('');
        $this->assertNotEquals($staticWithHtml->getHtml(), $staticEmpty->getHtml());
    }
}

View File

@@ -0,0 +1,36 @@
<?php
/**
* Created by PhpStorm.
* User: x
* Date: 2018/12/10
* Time: 1:14 AM
*/
namespace Tests\Feature;
use QL\QueryList;
use Tests\TestCaseBase;
/**
 * Tests for QueryList helper methods.
 */
class MethodTest extends TestCaseBase
{
    protected $html;

    /**
     * Read the 'snippet-1' fixture before each test.
     */
    protected function setUp(): void
    {
        $this->html = $this->getSnippet('snippet-1');
    }

    /**
     * HTML set inside a pipe() callback must come back from getHtml(false).
     *
     * @test
     */
    public function pipe()
    {
        $expected = $this->html;

        $loader = function(QueryList $ql) use($expected){
            $ql->setHtml($expected);
            return $ql;
        };
        $actual = QueryList::pipe($loader)->getHtml(false);

        $this->assertEquals($expected, $actual);
    }
}

20
tests/TestCaseBase.php Normal file
View File

@@ -0,0 +1,20 @@
<?php
/**
* Created by PhpStorm.
* User: x
* Date: 2018/12/9
* Time: 11:43 PM
*/
namespace Tests;
use PHPUnit\Framework\TestCase;
/**
 * Base class for the test suite; provides access to the HTML fixtures.
 */
class TestCaseBase extends TestCase
{
    /**
     * Read a fixture from the assets directory next to this file.
     *
     * @param $name Fixture file name without the ".html" extension.
     * @return string|false Result of file_get_contents() for that file.
     */
    public function getSnippet($name)
    {
        $path = sprintf('%s/assets/%s.html', __DIR__, $name);

        return file_get_contents($path);
    }
}

View File

@@ -0,0 +1,9 @@
<div id="one">
<div class="two">
<a href="http://querylist.cc">QueryList官网</a>
<img src="http://querylist.com/1.jpg" alt="这是图片" abc="这是一个自定义属性">
<img class="second_pic" src="http://querylist.com/2.jpg" alt="这是图片2">
<a href="http://doc.querylist.cc">QueryList文档</a>
</div>
<span>其它的<b>一些</b>文本</span>
</div>

View File

@@ -0,0 +1,16 @@
<div id="one">
<ul>
<li>
<a href="http://querylist.cc">QueryList官网</a>
<img src="http://querylist.com/1.jpg" alt="这是图片1" abc="这是一个自定义属性1">
</li>
<li>
<a href="http://v3.querylist.cc">QueryList V3文档</a>
<img src="http://querylist.com/2.jpg" alt="这是图片2" abc="这是一个自定义属性2">
</li>
<li>
<a href="http://v4.querylist.cc">QueryList V4文档</a>
<img src="http://querylist.com/3.jpg" alt="这是图片3" abc="这是一个自定义属性3">
</li>
</ul>
</div>

5
tests/bootstrap.php Normal file
View File

@@ -0,0 +1,5 @@
<?php
// Remove the script execution time limit (0 means no limit) for the test run.
set_time_limit(0);
// Load vendor/autoload.php (by convention the Composer-generated autoloader)
// so the classes used by the tests can be resolved.
require __DIR__.'/../vendor/autoload.php';