128 Commits

Author SHA1 Message Date
Jaeger(黄杰)
894fb4344e Merge pull request #145 from maxiaozhi/master
正则匹配成功时才替换掉html
2021-08-08 13:04:55 +08:00
lion
e4fc716acd 正则匹配成功时才替换掉html 2021-07-18 23:37:35 +08:00
Jaeger(黄杰)
39dc0ca9c6 Merge pull request #143 from maxiaozhi/patch-1
Fix the matching exception
2021-07-05 14:07:58 +08:00
maxiaozhi
ef0a2efd4f Fix the matching exception
Fix the matching exception when the page contains multiple tags prefixed with head (for example: < head >, < header >)
2021-07-05 13:51:24 +08:00
huangjie
5953daac54 update collect 2020-12-14 10:39:28 +08:00
huangjie
465c6aefc7 update collect 2020-09-27 17:41:44 +08:00
Jaeger(黄杰)
92cb319d44 Update README-ZH.md 2020-07-18 13:06:29 +08:00
Jaeger(黄杰)
cbf3e0fcad Update README.md 2020-07-18 13:05:59 +08:00
Jaeger(黄杰)
cfa2d94a79 Update FUNDING.yml 2020-07-17 13:20:49 +08:00
Jaeger(黄杰)
47a444bf9e Create FUNDING.yml 2020-07-17 13:08:44 +08:00
Jaeger
85903fa9b5 feat: rules add attrs 2020-04-03 20:16:00 +08:00
Jaeger(黄杰)
e527c637c7 Merge pull request #110 from jae-jae/develop
replace collect()
2020-04-03 04:55:27 -05:00
Jaeger
f0a9798925 replace collect() 2020-04-03 17:33:32 +08:00
Jaeger
faea883c6f fix: data callback 2020-04-01 22:03:50 +08:00
Jaeger
c16826a573 updaed composer dependency 2020-03-23 18:15:04 +08:00
Jaeger
1492751f98 feat: optimization getHtml() 2020-03-22 17:19:57 +08:00
Jaeger
b7954b9aef fix: memory overflow 2020-03-20 13:26:40 +08:00
Jaeger
b3d84cf057 feat: modify the each function of class elements 2020-03-15 14:17:18 +08:00
Jaeger
52bbdeae14 Merge branch 'master' of github.com:jae-jae/QueryList into develop 2020-03-15 14:07:52 +08:00
Jaeger(黄杰)
25b2dbdc86 Merge pull request #105 from edwinhuish/add-each-function-same-as-collection
添加 each function 并和 Collection 保持一致,返回 false 时中断循环。
2020-03-15 01:07:22 -05:00
Jaeger
02c2b125d8 feat: elements class add htmlOuters function 2020-03-15 13:58:00 +08:00
Jaeger
fc8b701ef2 feat: optimize range results 2020-03-15 13:45:00 +08:00
Jaeger
75e436c73f feat: merge master 2020-03-15 11:30:35 +08:00
Jaeger(黄杰)
aa90e5a21d Merge pull request #106 from edwinhuish/destroy-old-phpquey-object-when-setHtml
destroy old phpquery object when setHtml
2020-03-14 22:28:13 -05:00
Jaeger
dd9af6881d feat: rules add texts and htmls attribute 2020-03-13 21:42:25 +08:00
Jaeger
b07d4bfc74 feat: rules add texts and htmls attribute 2020-03-13 21:39:42 +08:00
Edwin Xu
8c1614c4c3 destroy old phpquery object when setHtml 2020-03-13 16:08:55 +08:00
Jaeger
b387ef5bb0 feat: rules add htmlOuter attribute 2020-03-13 15:16:44 +08:00
Edwin Xu
67f0052c5d 添加 each function 并和 Collection 保持一致,返回 false 时中断循环。 2020-03-13 14:20:37 +08:00
Jaeger
7c86f82527 fix: optimize memory usage 2020-03-13 13:49:36 +08:00
Jaeger(黄杰)
6ee6a26aee Merge pull request #102 from edwinhuish/auto-destroy-phpquery-document
destroy phpquery document object when destruct Query class
2020-03-11 10:29:31 -05:00
Jaeger(黄杰)
116f19da65 Merge pull request #104 from edwinhuish/add-phpdoc
fix phpdoc
2020-03-11 10:20:22 -05:00
Edwin Xu
67cbd0f473 修复phpdoc 2020-03-10 21:36:55 +08:00
Edwin Xu
3eb26451c6 修复phpdoc 2020-03-10 21:03:25 +08:00
Edwin Xu
a76ecb4258 destroy phpquery document object when destruct Query class 2020-03-05 22:27:27 +08:00
Jaeger
46f564bc8b Updated phpQuery 2019-02-22 15:33:54 +08:00
Jaeger
df9e3bbf19 test htpp cache 2018-12-12 15:29:31 +08:00
Jaeger
0c85eed7ef add multiGet and multiPost 2018-12-11 17:52:41 +08:00
Jaeger
df521923ac Concurrent requests 2018-12-11 00:00:17 +08:00
Jaeger
a779ef71f3 add MultiRequest 2018-12-10 19:23:15 +08:00
Jaeger
c32736bd9e add pipe 2018-12-10 01:27:48 +08:00
Jaeger
661bc3168d add phpunit 2018-12-10 00:13:16 +08:00
Jaeger
6d182ff061 remove instance 2018-12-07 00:35:58 +08:00
Jaeger
1c2e3f4adf add queryData() 2018-10-15 18:52:12 +08:00
Jaeger
1d73895981 single instance 2017-12-15 11:05:32 +08:00
Jaeger
03e6a955bf add https verify false 2017-12-14 10:31:47 +08:00
Jaeger
72a7543da3 fix laravel conflict bug 2017-11-15 10:46:51 +08:00
Jaeger
9d04003d73 fix laravel conflict bug 2017-11-15 10:43:28 +08:00
Jaeger
31ec950cdc ok 2017-10-09 11:27:08 +08:00
Jaeger
18bc6daea4 ok 2017-10-09 02:44:07 +08:00
Jaeger
f2c6ce7385 add comments 2017-10-09 01:48:56 +08:00
Jaeger
c0ed870dc8 ok 2017-10-08 23:01:22 +08:00
Jaeger
a4d0087e47 update README 2017-10-08 22:48:06 +08:00
Jaeger
a0f7b9aa3e ok 2017-10-02 10:30:24 +08:00
Jaeger
d812c47ede update 2017-10-01 23:37:09 +08:00
Jaeger
47c0f37233 update README 2017-10-01 12:49:01 +08:00
Jaeger
967ef10f23 ok 2017-10-01 01:14:32 +08:00
Jaeger
c82eb3c557 ok 2017-10-01 01:13:39 +08:00
Jaeger
f68cc2e218 add EN README 2017-10-01 01:11:47 +08:00
Jaeger
684e52c70e ok 2017-10-01 00:23:34 +08:00
Jaeger
777d837f18 update README 2017-09-30 21:49:07 +08:00
Jaeger
6e9a202ac2 update README 2017-09-30 21:46:31 +08:00
Jaeger
e885eece26 ok 2017-09-30 12:09:51 +08:00
Jaeger
aeeec5367e ok 2017-09-30 12:04:27 +08:00
Jaeger
c42a7b1766 ok 2017-09-30 12:02:25 +08:00
Jaeger
a3a830a744 add logo 2017-09-30 12:01:15 +08:00
Jaeger
7381ec21d3 update REMADE 2017-09-30 11:32:09 +08:00
Jaeger
95102a5ce2 ok 2017-09-30 01:41:09 +08:00
Jaeger
520195c929 update COMMUNITY 2017-09-30 01:39:16 +08:00
Jaeger
75799decc3 add COMMUNITY 2017-09-30 01:12:00 +08:00
Jaeger
33c574cdb9 ok 2017-09-29 23:47:35 +08:00
Jaeger
47a777789b ok 2017-09-29 18:43:24 +08:00
Jaeger
ad8ce44572 update README 2017-09-29 15:07:25 +08:00
Jaeger
1c54d63993 update README 2017-09-29 15:05:33 +08:00
Jaeger
59d48911fd update README 2017-09-29 14:59:16 +08:00
Jaeger
5ed0921d17 ok 2017-09-29 12:18:23 +08:00
Jaeger
fcdc5a16db ok 2017-09-29 12:16:50 +08:00
Jaeger
a8a438edbe update README 2017-09-29 00:36:02 +08:00
Jaeger
bd58352117 update http plugin 2017-09-26 17:54:29 +08:00
Jaeger
c3f8a48357 update config 2017-09-25 14:36:21 +08:00
Jaeger
006e24a117 fix bug 2017-09-25 14:15:26 +08:00
Jaeger
042993311f add getData 2017-09-24 15:11:44 +08:00
Jaeger
b6c21b653a V4 is coming 2017-09-22 22:37:25 +08:00
Jaeger
5422168c98 add plugin 2017-09-22 19:09:43 +08:00
Jaeger
624f071a0d fix bug 2017-09-22 12:05:29 +08:00
Jaeger
042c10cdea add Http service 2017-09-22 02:38:46 +08:00
Jaeger
2013e4d2b0 add Query 2017-09-22 01:51:46 +08:00
Jaeger
ad9b493fc0 add encoding service 2017-09-21 13:12:20 +08:00
Jaeger
43d8f71678 add service provider 2017-09-21 02:20:28 +08:00
Jaeger
02fe5a7f9e ok 2017-09-21 01:44:03 +08:00
Jaeger
8bd07f5fbb ok 2017-09-20 01:12:54 +08:00
Jaeger
02c4c93ee5 add query() 2017-09-19 19:06:16 +08:00
Jaeger
0fafaafa7b update README 2017-09-19 18:00:33 +08:00
Jaeger
fe749f08c2 add Dom 2017-09-19 17:48:48 +08:00
Jaeger
e3576ce407 start V4 2017-09-19 02:33:38 +08:00
Jaeger
1a7864dcf8 V3.2.1 2017-06-09 12:25:07 +08:00
Jaeger
5cc049992b V3.1.3 2017-06-09 12:23:43 +08:00
Jaeger
967f2d95cd fix bug 2017-06-09 12:21:49 +08:00
Jaeger
7f6b6e279e update composer 2017-06-09 12:13:17 +08:00
Jaeger
198385e336 Merge pull request #4 from baijunyao/master
Using version 0.9.7 of phpQuery-single
2017-06-09 12:10:55 +08:00
白俊遥
26d6cf5e43 Using version 0.9.7 of phpQuery-single
使用0.9.7版本的phpQuery-single以解决phpQuery.php文件中因写错hltml造成的错误;
2017-06-08 15:07:56 +08:00
Jaeger
700d56db49 Merge branch 'feature/log' into develop 2017-04-20 13:53:32 +08:00
Jaeger
1691ddf3ee log ok 2017-04-20 13:53:03 +08:00
Jaeger
cbae16c6a4 添加日志功能 2017-04-19 18:30:22 +08:00
Jaeger
66c4ef8c4f Merge pull request #2 from bryant1410/master
Fix broken headings in Markdown files
2017-04-17 14:38:35 +08:00
Jaeger
330c71778f Merge pull request #1 from han8gui/master
添加ua
2017-04-17 14:37:14 +08:00
Santiago Castro
1185ad399f Fix broken Markdown headings 2017-04-16 21:15:10 -03:00
han8gui
b3290d2484 添加ua 2017-02-09 10:51:09 +08:00
Jaeger
8e4cf456f2 update readme 2016-12-22 16:42:44 +08:00
Jaeger
f006e751ef update readme 2016-12-22 16:39:48 +08:00
Jaeger
64884ee72f update readme 2016-12-22 16:34:17 +08:00
Jaeger
777738adc3 update readme 2016-12-22 16:30:30 +08:00
JAE
4a003e5490 update readme 2016-01-14 11:29:09 +08:00
Jaeger
fbea1aaa94 update 2016-01-06 15:56:03 +08:00
x
c63ea6421c merge 2015-12-29 21:52:11 +08:00
x
ba6e6fb4c8 Merge branch 'dev' 2015-12-29 21:49:35 +08:00
x
8bbb3f3171 update var name 2015-12-29 21:45:55 +08:00
JAE
2b0c62489f update composer 2015-12-29 09:41:14 +08:00
JAE
6935f4b178 Merge branch 'master' of git.oschina.net:jae/QueryList 2015-12-28 15:45:44 +08:00
JAE
40a91e77b0 composer ok 2015-12-28 15:32:11 +08:00
JAE
121f1190d9 composer ok 2015-12-28 15:29:48 +08:00
JAE
67385ffa4b update composer 2015-12-28 15:13:57 +08:00
x
6ea1f5bb82 update 2015-12-27 23:38:47 +08:00
JAE
a3f12f9464 update 2015-12-25 17:51:52 +08:00
x
c334b361a6 update namespace 2015-12-24 00:33:52 +08:00
JAE
e1058b8201 update composer 2015-12-23 18:11:45 +08:00
JAE
b820f2bd53 add composer 2015-12-22 18:10:14 +08:00
x
5ac4bfe0d0 update readme 2015-12-20 23:56:40 +08:00
41 changed files with 2303 additions and 7940 deletions

12
.github/FUNDING.yml vendored Normal file
View File

@@ -0,0 +1,12 @@
# These are supported funding model platforms
github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
patreon: # Replace with a single Patreon username
open_collective: querylist # Replace with a single Open Collective username
ko_fi: # Replace with a single Ko-fi username
tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
liberapay: # Replace with a single Liberapay username
issuehunt: # Replace with a single IssueHunt username
otechie: # Replace with a single Otechie username
custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']

5
.gitignore vendored Normal file
View File

@@ -0,0 +1,5 @@
/vendor/
.idea/
composer.lock
.DS_Store
*.cache

View File

@@ -1,422 +0,0 @@
<?php
/**
* QueryList
*
* 一个基于phpQuery的通用列表采集类
*
* @author Jaeger
* @email 734708094@qq.com
* @link http://git.oschina.net/jae/QueryList
* @version 3.0.1
*
* @example
*
//获取CSDN移动开发栏目下的文章列表标题
$hj = QueryList::Query('http://mobile.csdn.net/',array("title"=>array('.unit h1','text')));
print_r($hj->data);
//回调函数1
function callfun1($content,$key)
{
return '回调函数1'.$key.'-'.$content;
}
class HJ{
//回调函数2
static public function callfun2($content,$key)
{
return '回调函数2'.$key.'-'.$content;
}
}
//获取CSDN文章页下面的文章标题和内容
$url = 'http://www.csdn.net/article/2014-06-05/2820091-build-or-buy-a-mobile-game-backend';
$reg = array(
'title'=>array('h1','text','','callfun1'), //获取纯文本格式的标题,并调用回调函数1
'summary'=>array('.summary','text','-input strong'), //获取纯文本的文章摘要,但保留strong标签并去除input标签
'content'=>array('.news_content','html','div a -.copyright'), //获取html格式的文章内容但过滤掉div和a标签,去除类名为copyright的元素
'callback'=>array('HJ','callfun2') //调用回调函数2作为全局回调函数
);
$rang = '.left';
$hj = QueryList::Query($url,$reg,$rang);
print_r($hj->data);
//继续获取右边相关热门文章列表的标题以及链接地址
$hj->setQuery(array('title'=>array('','text'),'url'=>array('a','href')),'#con_two_2 li');
//输出数据
echo $hj->getData();
*/
require 'phpQuery/phpQuery.php';
/**
 * QueryList - a general-purpose list scraper built on top of phpQuery.
 *
 * A query takes a page (URL or HTML source), a set of named selector rules and
 * an optional range selector, and stores the extracted rows in $data.
 */
class QueryList
{
    /** @var array Selector rules of the current query (name => rule array). */
    private $regArr;

    /** @var array Rows extracted by the most recent query. */
    public $data;

    /** @var string Range ("block") selector of the current query. */
    private $regRange;

    /** @var string Source of the page being queried (after optional head removal). */
    public $html;

    /** @var string Page source as serialised back by phpQuery (see getHtml()). */
    private $pqHtml;

    /** @var string|bool Encoding the results are converted to; false disables conversion. */
    private $outputEncoding = false;

    /** @var string|bool Explicit page encoding; false means "detect it". */
    private $inputEncoding = false;

    /** @var string|bool Encoding actually assumed for the current page. */
    private $htmlEncoding;

    /** @var array Instances created by getInstance(), keyed by a hash of its arguments. */
    public static $instances;

    public function __construct()
    {
    }

    /**
     * Static entry point: run a query on the shared QueryList instance.
     *
     * @param string     $page           URL of the page to fetch (http/https) or the HTML source itself
     * @param array      $regArr         Selector rules:
     *                                   array("name" => array("selector", "type"[, "tag filter list"][, callback]), ...
     *                                   [, "callback" => global callback]).
     *                                   - selector: selector passed to phpQuery's find()
     *                                   - type: "text", "html", or the name of an HTML attribute
     *                                   - tag filter list: optional, whitespace separated. An entry prefixed
     *                                     with "-" is a selector whose matches are removed together with their
     *                                     content. Other entries are tag names: kept when type is "text",
     *                                     stripped when type is "html".
     *                                   - callback: optional callable invoked as callback($content, $key);
     *                                     a per-rule callback takes precedence over the global one.
     * @param string     $regRange       Range selector: first select the matching blocks, then apply every rule inside each block
     * @param string     $outputEncoding Encoding of the returned data (UTF-8, GB2312, ...); a falsy value leaves the current setting untouched
     * @param string     $inputEncoding  Encoding of the page (UTF-8, GB2312, ...); a falsy value leaves the current setting untouched
     * @param bool|false $removeHead     Whether to empty the page's <head> section before parsing
     * @return mixed
     */
    public static function Query($page, array $regArr, $regRange = '', $outputEncoding = null, $inputEncoding = null, $removeHead = false)
    {
        return self::getInstance()->_query($page, $regArr, $regRange, $outputEncoding, $inputEncoding, $removeHead);
    }

    /**
     * Run a QueryList extension.
     *
     * @param string $class Name of the extension class
     * @param array  $args  Arguments handed to the extension's run() method
     * @return mixed
     * @throws QueryList_Exception
     */
    public static function run($class, $args = array())
    {
        $extension = self::getInstance($class);
        return $extension->run($args);
    }

    /**
     * Get a cached instance of any class.
     *
     * The first argument is the class name (default: QueryList); remaining
     * arguments are passed to its constructor. Instances are cached per
     * distinct argument list.
     *
     * @return mixed
     * @throws QueryList_Exception when the class does not exist
     */
    public static function getInstance()
    {
        $args = func_get_args();
        count($args) || $args = array('QueryList');
        $key = md5(serialize($args));
        $className = array_shift($args);
        if (!class_exists($className)) {
            throw new QueryList_Exception("no class {$className}");
        }
        if (!isset(self::$instances[$key])) {
            $rc = new ReflectionClass($className);
            self::$instances[$key] = $rc->newInstanceArgs($args);
        }
        return self::$instances[$key];
    }

    /**
     * Get the source of the target page (mainly for debugging).
     *
     * @param bool|true $rel true: the source as serialised by phpQuery; false: the source as stored in $html
     * @return string
     */
    public function getHtml($rel = true)
    {
        return $rel ? $this->pqHtml : $this->html;
    }

    /**
     * Get the extracted data.
     *
     * @param callback $callback Optional callable applied to every row via array_map()
     * @return array
     */
    public function getData($callback = null)
    {
        if (is_callable($callback)) {
            return array_map($callback, $this->data);
        }
        return $this->data;
    }

    /**
     * Run a new set of rules against the page that is already loaded.
     *
     * @param array      $regArr
     * @param string     $regRange
     * @param string     $outputEncoding
     * @param string     $inputEncoding
     * @param bool|false $removeHead
     * @return QueryList
     */
    public function setQuery(array $regArr, $regRange = '', $outputEncoding = null, $inputEncoding = null, $removeHead = false)
    {
        return $this->_query($this->html, $regArr, $regRange, $outputEncoding, $inputEncoding, $removeHead);
    }

    /**
     * Load the page, remember the query parameters and extract the data.
     *
     * @return QueryList $this
     */
    private function _query($page, array $regArr, $regRange, $outputEncoding, $inputEncoding, $removeHead)
    {
        $this->data = array();
        $this->html = $this->_isURL($page) ? $this->_request($page) : $page;
        // Falsy encodings do not reset a value configured by an earlier call.
        $outputEncoding && $this->outputEncoding = $outputEncoding;
        $inputEncoding && $this->inputEncoding = $inputEncoding;
        $removeHead && $this->html = $this->_removeHead($this->html);
        $this->pqHtml = '';
        if (empty($this->html)) {
            trigger_error("The received content is empty!", E_USER_NOTICE);
        }
        // Determine the page encoding: explicit setting first, detection otherwise.
        $this->htmlEncoding = $this->inputEncoding ? $this->inputEncoding : $this->_getEncode($this->html);
        $this->regArr = $regArr;
        $this->regRange = $regRange;
        $this->_getList();
        return $this;
    }

    /**
     * Apply the current rules to the current page and fill $data.
     */
    private function _getList()
    {
        $this->inputEncoding && phpQuery::$defaultCharset = $this->inputEncoding;
        $document = phpQuery::newDocumentHTML($this->html);
        $this->pqHtml = $document->htmlOuter();
        if (!empty($this->regRange)) {
            // One row per block matched by the range selector.
            $i = 0;
            foreach (pq($document)->find($this->regRange) as $item) {
                foreach ($this->regArr as $key => $reg_value) {
                    // Strict comparison: a rule stored under integer key 0 must not be
                    // mistaken for the global callback entry.
                    if ($key === 'callback') {
                        continue;
                    }
                    $iobj = pq($item)->find($reg_value[0]);
                    $this->data[$i][$key] = $this->_extractValue($iobj, $reg_value, $key);
                }
                $i++;
            }
        } else {
            // No range: the n-th match of every rule goes into row n.
            foreach ($this->regArr as $key => $reg_value) {
                if ($key === 'callback') {
                    continue;
                }
                // Each rule works on a freshly parsed copy of the page.
                $document = phpQuery::newDocumentHTML($this->html);
                $i = 0;
                foreach (pq($document)->find($reg_value[0]) as $item) {
                    $this->data[$i][$key] = $this->_extractValue($item, $reg_value, $key);
                    $i++;
                }
            }
        }
        if ($this->outputEncoding) {
            // Convert the extracted data to the requested output encoding.
            $this->data = $this->_arrayConvertEncoding($this->data, $this->outputEncoding, $this->htmlEncoding);
        }
        phpQuery::$documents = array();
    }

    /**
     * Extract one value from a node according to a single rule and run the
     * applicable callback (per-rule callback first, global callback otherwise).
     *
     * @param mixed  $node      Node or phpQuery object accepted by pq()
     * @param array  $reg_value Rule: array(selector, type[, tag filter list][, callback])
     * @param string $key       Name of the rule, passed to the callback
     * @return mixed
     */
    private function _extractValue($node, array $reg_value, $key)
    {
        $tags = isset($reg_value[2]) ? $reg_value[2] : '';
        switch ($reg_value[1]) {
            case 'text':
                $value = $this->_allowTags(pq($node)->html(), $tags);
                break;
            case 'html':
                $value = $this->_stripTags(pq($node)->html(), $tags);
                break;
            default:
                $value = pq($node)->attr($reg_value[1]);
                break;
        }
        if (isset($reg_value[3])) {
            $value = call_user_func($reg_value[3], $value, $key);
        } elseif (isset($this->regArr['callback'])) {
            $value = call_user_func($this->regArr['callback'], $value, $key);
        }
        return $value;
    }

    /**
     * Fetch a URL.
     *
     * Uses cURL when available, file_get_contents() otherwise.
     * NOTE(review): TLS peer/host verification is switched off for cURL requests;
     * this is kept because existing callers may rely on it, but it allows
     * man-in-the-middle tampering with the fetched page.
     *
     * @param string $url
     * @return string|bool Response body; false when the request failed
     */
    private function _request($url)
    {
        if (function_exists('curl_init')) {
            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, $url);
            curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
            curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
            curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
            curl_setopt($ch, CURLOPT_AUTOREFERER, true);
            curl_setopt($ch, CURLOPT_REFERER, $url);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
            $result = curl_exec($ch);
            curl_close($ch);
        } elseif (version_compare(PHP_VERSION, '5.0.0') >= 0) {
            $opts = array(
                'http' => array(
                    'header' => "Referer:{$url}"
                )
            );
            $result = file_get_contents($url, false, stream_context_create($opts));
        } else {
            $result = file_get_contents($url);
        }
        return $result;
    }

    /**
     * Replace the page's <head> section with an empty one.
     *
     * The opening tag must be exactly "head" (optionally with attributes), so
     * <header> is not affected, and the match stops at the first </head>.
     *
     * @param string $html
     * @return string The page with an empty head; the unchanged input if the regex engine fails
     */
    private function _removeHead($html)
    {
        $result = preg_replace('/<head(?:\s[^>]*)?>.*?<\/head\s*>/is', '<head></head>', $html);
        // preg_replace() returns null on a PCRE error (e.g. backtrack limit);
        // never throw the page away in that case.
        return $result === null ? $html : $result;
    }

    /**
     * Detect the encoding of a string.
     *
     * @param string $string
     * @return string|bool Detected encoding, or false when none of the candidates fits
     */
    private function _getEncode($string)
    {
        return mb_detect_encoding($string, array('ASCII', 'GB2312', 'GBK', 'UTF-8'));
    }

    /**
     * Convert the encoding of all string keys and string values of an array,
     * recursing into nested arrays. Non-string values are left untouched.
     *
     * @param array  $arr
     * @param string $toEncoding
     * @param string $fromEncoding A falsy value (e.g. failed detection) disables conversion
     * @return array
     */
    private function _arrayConvertEncoding($arr, $toEncoding, $fromEncoding)
    {
        if (!$fromEncoding || !is_array($arr)) {
            return $arr;
        }
        $converted = array();
        foreach ($arr as $key => $value) {
            if (is_string($key)) {
                $key = $this->_convertString($key, $toEncoding, $fromEncoding);
            }
            if (is_array($value)) {
                $value = $this->_arrayConvertEncoding($value, $toEncoding, $fromEncoding);
            } elseif (is_string($value)) {
                $value = $this->_convertString($value, $toEncoding, $fromEncoding);
            }
            $converted[$key] = $value;
        }
        return $converted;
    }

    /**
     * Convert a single string with iconv, ignoring unconvertible characters.
     *
     * @param string $string
     * @param string $toEncoding
     * @param string $fromEncoding
     * @return string The converted string; the original string when iconv fails
     */
    private function _convertString($string, $toEncoding, $fromEncoding)
    {
        $result = iconv($fromEncoding, $toEncoding . '//IGNORE', $string);
        return $result === false ? $string : $result;
    }

    /**
     * Simple check whether the argument looks like an http(s) URL.
     *
     * @param string $str
     * @return boolean
     */
    private function _isURL($str)
    {
        if (preg_match('/^http(s)?:\\/\\/.+/', $str)) {
            return true;
        }
        return false;
    }

    /**
     * Strip specific HTML tags (the tags themselves, not their content).
     *
     * @param string $html
     * @param string $tags_str Whitespace-separated filter list; "-selector" entries are removed with their content
     * @return string
     */
    private function _stripTags($html, $tags_str)
    {
        $tagsArr = $this->_tag($tags_str);
        $html = $this->_removeTags($html, $tagsArr[1]);
        $p = array();
        foreach ($tagsArr[0] as $tag) {
            // The lookahead ends the tag name, so "a" no longer strips <article>, <abbr>, ...
            // The tag is inserted verbatim, as it always has been.
            $p[] = "/<\/?" . $tag . "(?=[\s\/>])[^>]*>/i";
        }
        $html = preg_replace($p, "", trim($html));
        return $html;
    }

    /**
     * Strip all HTML tags except the listed ones.
     *
     * @param string $html
     * @param string $tags_str Whitespace-separated filter list; "-selector" entries are removed with their content
     * @return string
     */
    private function _allowTags($html, $tags_str)
    {
        $tagsArr = $this->_tag($tags_str);
        $html = $this->_removeTags($html, $tagsArr[1]);
        $allow = '';
        foreach ($tagsArr[0] as $tag) {
            $allow .= '<' . $tag . '> ';
        }
        return strip_tags(trim($html), $allow);
    }

    /**
     * Split a filter list into plain tag names and "-"-prefixed selectors.
     *
     * @param string $tags_str
     * @return array array(0 => plain entries, 1 => entries that had a leading "-", without the "-")
     */
    private function _tag($tags_str)
    {
        $tagArr = preg_split("/\s+/", $tags_str, -1, PREG_SPLIT_NO_EMPTY);
        $tags = array(array(), array());
        foreach ($tagArr as $tag) {
            // Only a leading "-" marks a removal; hyphens inside a name are part of the name.
            if (preg_match('/^-(.+)/', $tag, $arr)) {
                array_push($tags[1], $arr[1]);
            } else {
                array_push($tags[0], $tag);
            }
        }
        return $tags;
    }

    /**
     * Remove the elements matched by the given selectors, including their content.
     *
     * @param string $html
     * @param array  $tags Selectors of the elements to remove
     * @return string
     */
    private function _removeTags($html, $tags)
    {
        if (count($tags)) {
            $tag_str = implode(',', $tags);
            phpQuery::$defaultCharset = $this->inputEncoding ? $this->inputEncoding : $this->htmlEncoding;
            $doc = phpQuery::newDocumentHTML($html);
            pq($doc)->find($tag_str)->remove();
            $html = pq($doc)->htmlOuter();
            $doc->unloadDocument();
        }
        return $html;
    }
}
/**
 * Exception type thrown by QueryList, e.g. when a requested class does not exist.
 */
class QueryList_Exception extends Exception
{
}
/**
 * Autoloader for QueryList extensions located next to this file.
 */
class Autoload
{
    /**
     * Load a class from extensions/ or extensions/vendors/.
     *
     * @param string $className Class name; used verbatim as the file name (without ".php")
     * @return bool true when a matching file was found and required, false otherwise
     */
    public static function load($className)
    {
        // Directories are probed in this order; the first existing file wins.
        foreach (array('extensions', 'extensions/vendors') as $directory) {
            $candidate = __DIR__ . '/' . $directory . '/' . $className . '.php';
            if (!is_file($candidate)) {
                continue;
            }
            require $candidate;
            return true;
        }
        return false;
    }
}

// Register the extension autoloader.
spl_autoload_register(array('Autoload', 'load'));

309
README-ZH.md Normal file
View File

@@ -0,0 +1,309 @@
<p align="center">
<img width="150" src="logo.png" alt="QueryList">
<br>
<br>
</p>
# QueryList 简介
`QueryList`是一套简洁、优雅、可扩展的PHP采集工具(爬虫)基于phpQuery。
## 特性
- 拥有与jQuery完全相同的CSS3 DOM选择器
- 拥有与jQuery完全相同的DOM操作API
- 拥有通用的列表采集方案
- 拥有强大的HTTP请求套件,轻松实现如:模拟登陆、伪造浏览器、HTTP代理等任意复杂的网络请求
- 拥有乱码解决方案
- 拥有强大的内容过滤功能,可使用jQuery选择器来过滤内容
- 拥有高度的模块化设计,扩展性强
- 拥有富有表现力的API
- 拥有高质量文档
- 拥有丰富的插件
- 拥有专业的问答社区和交流群
通过插件可以轻松实现诸如:
- 多线程采集
- 采集JavaScript动态渲染的页面 (PhantomJS/headless WebKit)
- 图片本地化
- 模拟浏览器行为提交Form表单
- 网络爬虫
- .....
## 环境要求
- PHP >= 7.1
> 如果你的PHP版本还停留在PHP5或者不会使用Composer,你可以选择使用QueryList3,QueryList3支持php5.3以及手动安装。
QueryList3 文档:http://v3.querylist.cc
## 安装
通过Composer安装:
```
composer require jaeger/querylist
```
## 使用
#### 元素操作
- 采集「昵图网」所有图片地址
```php
QueryList::get('http://www.nipic.com')->find('img')->attrs('src');
```
- 采集百度搜索结果
```php
$ql = QueryList::get('http://www.baidu.com/s?wd=QueryList');
$ql->find('title')->text(); // 获取网站标题
$ql->find('meta[name=keywords]')->content; // 获取网站头部关键词
$ql->find('h3>a')->texts(); //获取搜索结果标题列表
$ql->find('h3>a')->attrs('href'); //获取搜索结果链接列表
$ql->find('img')->src; //获取第一张图片的链接地址
$ql->find('img:eq(1)')->src; //获取第二张图片的链接地址
$ql->find('img')->eq(2)->src; //获取第三张图片的链接地址
// 遍历所有图片
$ql->find('img')->map(function($img){
echo $img->alt; //打印图片的alt属性
});
```
- 更多用法
```php
$ql->find('#head')->append('<div>追加内容</div>')->find('div')->htmls();
$ql->find('.two')->children('img')->attrs('alt'); //获取class为two元素下的所有img孩子节点
//遍历class为two元素下的所有孩子节点
$data = $ql->find('.two')->children()->map(function ($item){
//用is判断节点类型
if($item->is('a')){
return $item->text();
}elseif($item->is('img'))
{
return $item->alt;
}
});
$ql->find('a')->attr('href', 'newVal')->removeClass('className')->html('newHtml')->...
$ql->find('div > p')->add('div > ul')->filter(':has(a)')->find('p:first')->nextAll()->andSelf()->...
$ql->find('div.old')->replaceWith( $ql->find('div.new')->clone())->appendTo('.trash')->prepend('Deleted')->...
```
#### 列表采集
采集百度搜索结果列表的标题和链接:
```php
$data = QueryList::get('http://www.baidu.com/s?wd=QueryList')
// 设置采集规则
->rules([
'title'=>array('h3','text'),
'link'=>array('h3>a','href')
])
->query()->getData();
print_r($data->all());
```
采集结果:
```
Array
(
[0] => Array
(
[title] => QueryList|基于phpQuery的无比强大的PHP采集工具
[link] => http://www.baidu.com/link?url=GU_YbDT2IHk4ns1tjG2I8_vjmH0SCJEAPuuZN
)
[1] => Array
(
[title] => PHP 用QueryList抓取网页内容 - wb145230 - 博客园
[link] => http://www.baidu.com/link?url=zn0DXBnrvIF2ibRVW34KcRVFG1_bCdZvqvwIhUqiXaS
)
[2] => Array
(
[title] => 介绍- QueryList指导文档
[link] => http://www.baidu.com/link?url=pSypvMovqS4v2sWeQo5fDBJ4EoYhXYi0Lxx
)
//...
)
```
#### 编码转换
```php
// 输出编码:UTF-8,输入编码:GB2312
QueryList::get('https://top.etao.com')->encoding('UTF-8','GB2312')->find('a')->texts();
// 输出编码:UTF-8,输入编码:自动识别
QueryList::get('https://top.etao.com')->encoding('UTF-8')->find('a')->texts();
```
#### HTTP网络操作GuzzleHttp
- 携带cookie登录新浪微博
```php
//采集新浪微博需要登录才能访问的页面
$ql = QueryList::get('http://weibo.com','param1=testvalue & params2=somevalue',[
'headers' => [
//填写从浏览器获取到的cookie
'Cookie' => 'SINAGLOBAL=546064; wb_cmtLike_2112031=1; wvr=6;....'
]
]);
//echo $ql->getHtml();
echo $ql->find('title')->text();
//输出: 我的首页 微博-随时随地发现新鲜事
```
- 使用Http代理
```php
$urlParams = ['param1' => 'testvalue','params2' => 'somevalue'];
$opts = [
// 设置http代理
'proxy' => 'http://222.141.11.17:8118',
//设置超时时间,单位:秒
'timeout' => 30,
// 伪造http头
'headers' => [
'Referer' => 'https://querylist.cc/',
'User-Agent' => 'testing/1.0',
'Accept' => 'application/json',
'X-Foo' => ['Bar', 'Baz'],
'Cookie' => 'abc=111;xxx=222'
]
];
$ql->get('http://httpbin.org/get',$urlParams,$opts);
// echo $ql->getHtml();
```
- 模拟登录
```php
// 用post登录
$ql = QueryList::post('http://xxxx.com/login',[
'username' => 'admin',
'password' => '123456'
])->get('http://xxx.com/admin');
//采集需要登录才能访问的页面
$ql->get('http://xxx.com/admin/page');
//echo $ql->getHtml();
```
#### Form表单操作
模拟登陆GitHub
```php
// 获取QueryList实例
$ql = QueryList::getInstance();
//获取到登录表单
$form = $ql->get('https://github.com/login')->find('form');
//填写GitHub用户名和密码
$form->find('input[name=login]')->val('your github username or email');
$form->find('input[name=password]')->val('your github password');
//序列化表单数据
$fromData = $form->serializeArray();
$postData = [];
foreach ($fromData as $item) {
$postData[$item['name']] = $item['value'];
}
//提交登录表单
$actionUrl = 'https://github.com'.$form->attr('action');
$ql->post($actionUrl,$postData);
//判断登录是否成功
// echo $ql->getHtml();
$userName = $ql->find('.header-nav-current-user>.css-truncate-target')->text();
if($userName)
{
echo '登录成功!欢迎你:'.$userName;
}else{
echo '登录失败!';
}
```
#### Bind功能扩展
自定义扩展一个`myHttp`方法:
```php
$ql = QueryList::getInstance();
//绑定一个myHttp方法到QueryList对象
$ql->bind('myHttp',function ($url){
// $this 为当前的QueryList对象
$html = file_get_contents($url);
$this->setHtml($html);
return $this;
});
//然后就可以通过注册的名字来调用
$data = $ql->myHttp('https://toutiao.io')->find('h3 a')->texts();
print_r($data->all());
```
或者把实现体封装到class然后这样绑定:
```php
$ql->bind('myHttp',function ($url){
return new MyHttp($this,$url);
});
```
#### 插件使用
- 使用PhantomJS插件采集JavaScript动态渲染的页面:
```php
// 安装时设置PhantomJS二进制文件路径
$ql = QueryList::use(PhantomJs::class,'/usr/local/bin/phantomjs');
// 采集今日头条手机版
$data = $ql->browser('https://m.toutiao.com')->find('p')->texts();
print_r($data->all());
// 使用HTTP代理
$ql->browser('https://m.toutiao.com',false,[
'--proxy' => '192.168.1.42:8080',
'--proxy-type' => 'http'
])
```
- 使用CURL多线程插件,多线程采集GitHub排行榜:
```php
$ql = QueryList::use(CurlMulti::class);
$ql->curlMulti([
'https://github.com/trending/php',
'https://github.com/trending/go',
//.....more urls
])
// 每个任务成功完成调用此回调
->success(function (QueryList $ql,CurlMulti $curl,$r){
echo "Current url:{$r['info']['url']} \r\n";
$data = $ql->find('h3 a')->texts();
print_r($data->all());
})
// 每个任务失败回调
->error(function ($errorInfo,CurlMulti $curl){
echo "Current url:{$errorInfo['info']['url']} \r\n";
print_r($errorInfo['error']);
})
->start([
// 最大并发数
'maxThread' => 10,
// 错误重试次数
'maxTry' => 3,
]);
```
## 插件
- [jae-jae/QueryList-PhantomJS](https://github.com/jae-jae/QueryList-PhantomJS): 使用PhantomJS采集JavaScript动态渲染的页面
- [jae-jae/QueryList-CurlMulti](https://github.com/jae-jae/QueryList-CurlMulti) : Curl多线程采集
- [jae-jae/QueryList-AbsoluteUrl](https://github.com/jae-jae/QueryList-AbsoluteUrl) : 转换URL相对路径到绝对路径
- [jae-jae/QueryList-Rule-Google](https://github.com/jae-jae/QueryList-Rule-Google) : 谷歌搜索引擎
- [jae-jae/QueryList-Rule-Baidu](https://github.com/jae-jae/QueryList-Rule-Baidu) : 百度搜索引擎
查看更多的QueryList插件和基于QueryList的产品:[QueryList社区力量](https://github.com/jae-jae/QueryList-Community)
## 贡献
欢迎为QueryList贡献代码。关于贡献插件可以查看:[QueryList插件贡献说明](https://github.com/jae-jae/QueryList-Community/blob/master/CONTRIBUTING.md)
## 寻求帮助?
- QueryList主页: [http://querylist.cc](http://querylist.cc/)
- QueryList文档: [http://doc.querylist.cc](http://doc.querylist.cc/)
- QueryList问答:[http://wenda.querylist.cc](http://wenda.querylist.cc/)
- QueryList交流QQ群:123266961 <a target="_blank" href="http://shang.qq.com/wpa/qunwpa?idkey=a1b248ae30b3f711bdab4f799df839300dc7fed54331177035efa0513da027f6"><img border="0" src="http://pub.idqqimg.com/wpa/images/group.png" alt="cafeEX" title="cafeEX"></a>
- GitHub:https://github.com/jae-jae/QueryList
- Git@OSC:http://git.oschina.net/jae/QueryList
## Author
Jaeger <JaegerCode@gmail.com>
## License
QueryList is licensed under the license of MIT. See the LICENSE for more details.

315
README.md
View File

@@ -1,11 +1,304 @@
#QueryList交流社区: [http://querylist.cc/](http://querylist.cc/)
#QueryList交流QQ群:123266961 <a target="_blank" href="http://shang.qq.com/wpa/qunwpa?idkey=a1b248ae30b3f711bdab4f799df839300dc7fed54331177035efa0513da027f6"><img border="0" src="http://pub.idqqimg.com/wpa/images/group.png" alt="╰☆邪恶 魔方☆" title="╰☆邪恶 魔方☆"></a>
#QueryList简介
***
QueryList是一个基于phpQuery的通用列表采集类,是一个简单、 灵活、强大的采集工具,采集任何复杂的页面 基本上就一句话就能搞定了。
#QueryList 使用
```php
//获取采集对象
$hj = QueryList::Query('http://www.baidu.com/s?wd=QueryList',array('title'=>array('h3','text'),'link'=>array('h3>a','href')));
<p align="center">
<img width="150" src="logo.png" alt="QueryList">
<br>
<br>
</p>
# QueryList
`QueryList` is a simple, elegant and extensible PHP web scraper (crawler/spider), based on phpQuery.
[API Documentation](https://github.com/jae-jae/QueryList/wiki)
[中文文档](README-ZH.md)
## Features
- Uses the same CSS3 DOM selectors as jQuery
- Provides the same DOM manipulation API as jQuery
- Provides a generic solution for crawling lists
- Includes a powerful HTTP request suite that makes complex requests easy, such as simulated login, browser spoofing and HTTP proxies
- Provides a solution for garbled text caused by character-encoding problems
- Offers powerful content filtering; you can use jQuery selectors to filter content
- Highly modular design with strong extensibility
- Has an expressive API
- Has a wealth of plug-ins
Through plug-ins you can easily implement things like:
- Multithreaded crawl
- Crawl JavaScript dynamic rendering page (PhantomJS/headless WebKit)
- Image downloads to local
- Simulate browser behavior such as submitting Form forms
- Web crawler
- .....
## Requirements
- PHP >= 7.1
## Installation
Install via Composer:
```
composer require jaeger/querylist
```
## Usage
#### DOM Traversal and Manipulation
- Crawl all image links on GitHub
```php
QueryList::get('https://github.com')->find('img')->attrs('src');
```
- Crawl Google search results
```php
$ql = QueryList::get('https://www.google.co.jp/search?q=QueryList');
$ql->find('title')->text(); //The page title
$ql->find('meta[name=keywords]')->content; //The page keywords
$ql->find('h3>a')->texts(); //Get a list of search results titles
$ql->find('h3>a')->attrs('href'); //Get a list of search results links
$ql->find('img')->src; //Gets the link address of the first image
$ql->find('img:eq(1)')->src; //Gets the link address of the second image
$ql->find('img')->eq(2)->src; //Gets the link address of the third image
// Loop all the images
$ql->find('img')->map(function($img){
echo $img->alt; //Print the alt attribute of the image
});
```
- More usage
```php
$ql->find('#head')->append('<div>Append content</div>')->find('div')->htmls();
$ql->find('.two')->children('img')->attrs('alt'); // Get all img child nodes of the elements with class "two"
// Loop over all child nodes of the elements with class "two"
$data = $ql->find('.two')->children()->map(function ($item){
// Use "is" to determine the node type
if($item->is('a')){
return $item->text();
}elseif($item->is('img'))
{
return $item->alt;
}
});
$ql->find('a')->attr('href', 'newVal')->removeClass('className')->html('newHtml')->...
$ql->find('div > p')->add('div > ul')->filter(':has(a)')->find('p:first')->nextAll()->andSelf()->...
$ql->find('div.old')->replaceWith( $ql->find('div.new')->clone())->appendTo('.trash')->prepend('Deleted')->...
```
#### List crawl
Crawl the title and link of the Google search results list:
```php
$data = QueryList::get('https://www.google.co.jp/search?q=QueryList')
// Set the crawl rules
->rules([
'title'=>array('h3','text'),
'link'=>array('h3>a','href')
])
->query()->getData();
print_r($data->all());
```
Results:
```
Array
(
[0] => Array
(
[title] => Angular - QueryList
[link] => https://angular.io/api/core/QueryList
)
[1] => Array
(
[title] => QueryList | @angular/core - Angularリファレンス - Web Creative Park
[link] => http://www.webcreativepark.net/angular/querylist/
)
[2] => Array
(
[title] => QueryListにQueryを追加したり、追加されたことを感知する | TIPS ...
[link] => http://www.webcreativepark.net/angular/querylist_query_add_subscribe/
)
//...
)
```
#### Encode convert
```php
// Out charset :UTF-8
// In charset :GB2312
QueryList::get('https://top.etao.com')->encoding('UTF-8','GB2312')->find('a')->texts();
// Out charset:UTF-8
// In charset:Automatic Identification
QueryList::get('https://top.etao.com')->encoding('UTF-8')->find('a')->texts();
```
#### HTTP Client (GuzzleHttp)
- Log in to GitHub using a browser cookie
```php
//Crawl GitHub content
$ql = QueryList::get('https://github.com','param1=testvalue & params2=somevalue',[
'headers' => [
// Fill in the cookie from the browser
'Cookie' => 'SINAGLOBAL=546064; wb_cmtLike_2112031=1; wvr=6;....'
]
]);
//echo $ql->getHtml();
$userName = $ql->find('.header-nav-current-user>.css-truncate-target')->text();
echo $userName;
```
- Use the Http proxy
```php
$urlParams = ['param1' => 'testvalue','params2' => 'somevalue'];
$opts = [
// Set the http proxy
'proxy' => 'http://222.141.11.17:8118',
//Set the timeout time in seconds
'timeout' => 30,
// Fake HTTP headers
'headers' => [
'Referer' => 'https://querylist.cc/',
'User-Agent' => 'testing/1.0',
'Accept' => 'application/json',
'X-Foo' => ['Bar', 'Baz'],
'Cookie' => 'abc=111;xxx=222'
]
];
$ql->get('http://httpbin.org/get',$urlParams,$opts);
// echo $ql->getHtml();
```
- Simulated login
```php
// Post login
$ql = QueryList::post('http://xxxx.com/login',[
'username' => 'admin',
'password' => '123456'
])->get('http://xxx.com/admin');
// Crawl pages that need to be logged in to access
$ql->get('http://xxx.com/admin/page');
//echo $ql->getHtml();
```
#### Submit forms
Login GitHub
```php
// Get the QueryList instance
$ql = QueryList::getInstance();
// Get the login form
$form = $ql->get('https://github.com/login')->find('form');
// Fill in the GitHub username and password
$form->find('input[name=login]')->val('your github username or email');
$form->find('input[name=password]')->val('your github password');
// Serialize the form data
$fromData = $form->serializeArray();
$postData = [];
foreach ($fromData as $item) {
$postData[$item['name']] = $item['value'];
}
// Submit the login form
$actionUrl = 'https://github.com'.$form->attr('action');
$ql->post($actionUrl,$postData);
// To determine whether the login is successful
// echo $ql->getHtml();
$userName = $ql->find('.header-nav-current-user>.css-truncate-target')->text();
if($userName)
{
echo 'Login successful ! Welcome:'.$userName;
}else{
echo 'Login failed !';
}
```
#### Bind function extension
Customize the extension of a `myHttp` method:
```php
$ql = QueryList::getInstance();
//Bind a `myHttp` method to the QueryList object
$ql->bind('myHttp',function ($url){
// $this is the current QueryList object
$html = file_get_contents($url);
$this->setHtml($html);
return $this;
});
// And then you can call by the name of the binding
$data = $ql->myHttp('https://toutiao.io')->find('h3 a')->texts();
print_r($data->all());
```
Or package to class, and then bind:
```php
$ql->bind('myHttp',function ($url){
return new MyHttp($this,$url);
});
```
#### Using plugins
- Use the PhantomJS plugin to crawl JavaScript dynamically rendered pages:
```php
// Set the PhantomJS binary file path during installation
$ql = QueryList::use(PhantomJs::class,'/usr/local/bin/phantomjs');
// Crawl「500px」all picture links
$data = $ql->browser('https://500px.com/editors')->find('img')->attrs('src');
print_r($data->all());
// Use the HTTP proxy
$ql->browser('https://500px.com/editors',false,[
'--proxy' => '192.168.1.42:8080',
'--proxy-type' => 'http'
])
```
- Using the CURL multithreading plug-in, multi-threaded crawling GitHub trending :
```php
$ql = QueryList::use(CurlMulti::class);
$ql->curlMulti([
'https://github.com/trending/php',
'https://github.com/trending/go',
//.....more urls
])
// Called if task is success
->success(function (QueryList $ql,CurlMulti $curl,$r){
echo "Current url:{$r['info']['url']} \r\n";
$data = $ql->find('h3 a')->texts();
print_r($data->all());
})
// Task fail callback
->error(function ($errorInfo,CurlMulti $curl){
echo "Current url:{$errorInfo['info']['url']} \r\n";
print_r($errorInfo['error']);
})
->start([
// Maximum number of threads
'maxThread' => 10,
// Number of error retries
'maxTry' => 3,
]);
```
## Plugins
- [jae-jae/QueryList-PhantomJS](https://github.com/jae-jae/QueryList-PhantomJS):Use PhantomJS to crawl Javascript dynamically rendered page.
- [jae-jae/QueryList-CurlMulti](https://github.com/jae-jae/QueryList-CurlMulti) : Curl multi threading.
- [jae-jae/QueryList-AbsoluteUrl](https://github.com/jae-jae/QueryList-AbsoluteUrl) : Converting relative urls to absolute.
- [jae-jae/QueryList-Rule-Google](https://github.com/jae-jae/QueryList-Rule-Google) : Google searcher.
- [jae-jae/QueryList-Rule-Baidu](https://github.com/jae-jae/QueryList-Rule-Baidu) : Baidu searcher.
View more QueryList plugins and QueryList-based products: [QueryList Community](https://github.com/jae-jae/QueryList-Community)
## Contributing
Contributions to QueryList are welcome. For details on contributing plugins, see the [QueryList Plugin Contributing Guide](https://github.com/jae-jae/QueryList-Community/blob/master/CONTRIBUTING.md).
## Author
Jaeger <JaegerCode@gmail.com>
If this library is useful to you, say thanks by [buying me a beer :beer:](https://www.paypal.me/jaepay)!
## License
QueryList is licensed under the license of MIT. See the LICENSE for more details.

40
composer.json Normal file
View File

@@ -0,0 +1,40 @@
{
"name": "jaeger/querylist",
"description": "Simple, elegant, extensible PHP Web Scraper (crawler/spider),Use the css3 dom selector,Based on phpQuery! 简洁、优雅、可扩展的PHP采集工具(爬虫)基于phpQuery。",
"keywords":["QueryList","phpQuery","spider"],
"homepage": "http://querylist.cc",
"require": {
"PHP":">=7.1",
"jaeger/phpquery-single": "^1",
"jaeger/g-http": "^1.1",
"ext-dom": "*",
"tightenco/collect": ">5.0"
},
"suggest":{
},
"license": "MIT",
"authors": [
{
"name": "Jaeger",
"email": "JaegerCode@gmail.com"
}
],
"autoload":{
"psr-4":{
"QL\\":"src"
}
},
"autoload-dev": {
"psr-4": {
"Tests\\": "tests/"
}
},
"require-dev": {
"symfony/var-dumper": "^3.3",
"phpunit/phpunit": "^8.5"
},
"scripts": {
"test": "./vendor/bin/phpunit"
}
}

View File

@@ -1,19 +0,0 @@
<?php
/**
* @Author: Jaeger <hj.q@qq.com>
* @Date: 2015-11-11 17:52:40
* @Last Modified by: Jaeger
* @Last Modified time: 2015-11-16 09:57:56
* @version 1.0
* 扩展基类
*/
/**
 * Base class for extensions.
 *
 * Subclasses implement run(); getInstance() forwards to QueryList::getInstance().
 */
abstract class AQuery
{
    /**
     * Entry point of the extension.
     *
     * @param array $args extension arguments
     */
    abstract function run(array $args);

    /**
     * Obtain an instance through QueryList::getInstance().
     *
     * @param string $className class name passed through to QueryList::getInstance()
     * @param mixed  $params    parameters passed through to QueryList::getInstance()
     * @return mixed whatever QueryList::getInstance() returns
     */
    public function getInstance($className = 'QueryList', $params = null)
    {
        $instance = QueryList::getInstance($className, $params);

        return $instance;
    }
}

View File

@@ -1,44 +0,0 @@
<?php
/**
* @Author: Jaeger <hj.q@qq.com>
* @Date: 2015-11-11 17:52:40
* @Last Modified by: Jaeger
* @Last Modified time: 2015-11-16 09:57:58
* @version 1.0
* 模拟登陆扩展
*/
/**
 * Simulated-login extension.
 *
 * run() performs the initial request through Request::hq() and keeps the
 * resulting HTTP client so that later get()/post() calls reuse it.
 */
class Login extends Request
{
    /** HTTP client returned by hq(); reused by get() and post() */
    private $http;

    /** Result of the request performed by run() */
    public $html;

    /**
     * Perform the login request.
     *
     * @param array $args arguments forwarded to hq()
     * @return $this
     */
    public function run(array $args)
    {
        $this->http = $this->hq($args);
        $this->html = $this->http->result;
        return $this;
    }

    /**
     * Issue a GET request with the kept HTTP client.
     *
     * @param string        $url
     * @param callable|null $callback optional filter applied to the response, called as ($html, $args)
     * @param mixed         $args     second argument handed to $callback
     * @return mixed instance returned by getInstance() with its html property set
     */
    public function get($url, $callback = null, $args = null)
    {
        $result = $this->http->get($url);
        return $this->getQL($result, $callback, $args);
    }

    /**
     * Issue a POST request with the kept HTTP client.
     *
     * @param string        $url
     * @param array         $data     POST fields
     * @param callable|null $callback optional filter applied to the response, called as ($html, $args)
     * @param mixed         $args     second argument handed to $callback
     * @return mixed instance returned by getInstance() with its html property set
     */
    public function post($url, $data = array(), $callback = null, $args = null)
    {
        $result = $this->http->post($url, $data);
        return $this->getQL($result, $callback, $args);
    }

    /**
     * Wrap a response in an instance obtained from getInstance().
     *
     * @param mixed         $html     response to wrap
     * @param callable|null $callback optional filter, called as ($html, $args)
     * @param mixed         $args     second argument handed to $callback
     * @return mixed instance with its html property set
     */
    private function getQL($html, $callback = null, $args = null)
    {
        if (is_callable($callback)) {
            // The callback must receive the response and its return value must be
            // used, the same way Request::hq() treats its callback. The previous
            // code passed an undefined variable and discarded the return value.
            $html = call_user_func($callback, $html, $args);
        }
        $ql = $this->getInstance();
        $ql->html = $html;
        return $ql;
    }
}

View File

@@ -1,73 +0,0 @@
<?php
/**
* @Author: Jaeger <hj.q@qq.com>
* @Date: 2015-11-11 17:52:40
* @Last Modified by: Jaeger
* @Last Modified time: 2015-11-16 09:57:14
* @version 1.0
* 多线程扩展
*/
/**
 * Multi-thread extension built on top of CurlMulti.
 */
class Multi extends AQuery
{
    /** CurlMulti instance obtained through getInstance() */
    public $curl;

    /** Effective arguments: defaults overlaid with the caller's values */
    private $args;

    /**
     * Configure the CurlMulti instance, queue the initial URL list and
     * optionally start the transfers.
     *
     * @param array $args keys: curl, list, success, error, start
     * @return $this
     */
    public function run(array $args)
    {
        $curlDefaults = array(
            'opt' => array(
                CURLOPT_SSL_VERIFYPEER => false,
                CURLOPT_SSL_VERIFYHOST => false,
                CURLOPT_FOLLOWLOCATION => true,
                CURLOPT_AUTOREFERER => true,
            ),
            'maxThread' => 100,
            'maxTry' => 3
        );
        $defaults = array(
            'curl' => $curlDefaults,
            'list' => array(),
            'success' => function(){},
            'error' => function(){},
            'start' => true
        );

        // Shallow merge: a caller-supplied top-level key replaces the default entirely
        $this->args = array_merge($defaults, $args);
        $this->curl = $this->getInstance('CurlMulti');

        if (isset($this->args['curl'])) {
            // Each entry is copied onto the CurlMulti object as a property
            foreach ($this->args['curl'] as $property => $value) {
                $this->curl->$property = $value;
            }
        }

        $this->add($this->args['list']);

        if ($this->args['start']) {
            return $this->start();
        }
        return $this;
    }

    /**
     * Queue one or more URLs.
     *
     * @param string|array   $urls    a single URL or a list of URLs
     * @param callable|false $success per-call success callback; falls back to the configured one
     * @param callable|false $error   per-call error callback; falls back to the configured one
     * @return $this
     */
    public function add($urls, $success = false, $error = false)
    {
        $list = is_array($urls) ? $urls : array($urls);

        foreach ($list as $url) {
            $item = array(
                'url' => $url,
                'args' => $this,
                'opt' => array(
                    CURLOPT_REFERER => $url
                )
            );
            $this->curl->add(
                $item,
                $success ?: $this->args['success'],
                $error ?: $this->args['error']
            );
        }

        return $this;
    }

    /**
     * Start the queued transfers.
     *
     * @return $this
     */
    public function start()
    {
        $this->curl->start();

        return $this;
    }
}

View File

@@ -1,37 +0,0 @@
<?php
/**
* @Author: Jaeger <hj.q@qq.com>
* @Date: 2015-07-15 23:27:52
* @Last Modified by: Jaeger
* @Last Modified time: 2015-11-16 11:01:19
* @version 1.0
* 网络操作扩展
*/
/**
 * Network request extension.
 */
class Request extends AQuery
{
    /**
     * Execute an HTTP request and optionally post-process its result.
     *
     * @param array $args either the Http configuration itself, or an array with
     *                    the keys 'http' (configuration), 'callback' and 'args'
     * @return mixed the Http instance after execute(), with result possibly
     *               replaced by the callback's return value
     */
    protected function hq(array $args)
    {
        // Without an explicit 'http' key the whole argument array is the configuration
        if (isset($args['http'])) {
            $httpConfig = $args['http'];
        } else {
            $httpConfig = $args;
        }
        if (isset($args['callback'])) {
            $callback = $args['callback'];
        } else {
            $callback = '';
        }
        if (isset($args['args'])) {
            $callbackArgs = $args['args'];
        } else {
            $callbackArgs = '';
        }

        $http = $this->getInstance('Http');
        $http->initialize($httpConfig);
        $http->execute();

        if (!empty($callback)) {
            $http->result = call_user_func($callback, $http->result, $callbackArgs);
        }

        return $http;
    }

    /**
     * Execute the request and hand its result to a fresh instance from getInstance().
     *
     * @param array $args see hq()
     * @return mixed instance with its html property set to the request result
     */
    public function run(array $args)
    {
        $response = $this->hq($args);

        $ql = $this->getInstance();
        $ql->html = $response->result;

        return $ql;
    }
}

View File

@@ -1,698 +0,0 @@
<?php
/**
* Chrome Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.47 Safari/536.11
* IE6 Mozilla/5.0 (Windows NT 6.1; rv:9.0.1) Gecko/20100101 Firefox/9.0.1
* FF Mozilla/5.0 (Windows NT 6.1; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0
*
* more useragent:http://www.useragentstring.com/
*
* @author admin@phpdr.net
*
*/
class CurlMulti {
// url
const TASK_ITEM_URL = 0x01;
// file
const TASK_ITEM_FILE = 0x02;
// arguments
const TASK_ITEM_ARGS = 0x03;
// operation, task level
const TASK_ITEM_OPT = 0x04;
// control options
const TASK_ITEM_CTL = 0x05;
// file pointer
const TASK_FP = 0x06;
// success callback
const TASK_PROCESS = 0x07;
// curl fail callback
const TASK_FAIL = 0x08;
// tryed times
const TASK_TRYED = 0x09;
// handler
const TASK_CH = 0x0A;
// global max thread num
public $maxThread = 10;
// Max thread by task type.Task type is specified in $item['ctl'] in add().If task has no type,$this->maxThreadNoType is maxThread-sum(maxThreadType).If less than 0 $this->maxThreadNoType is set to 0.
public $maxThreadType = array ();
// retry time(s) when task failed
public $maxTry = 3;
// operation, class level curl opt
public $opt = array ();
// cache options,dirLevel values is less than 3
public $cache = array (
'enable' => false,
'enableDownload' => false,
'compress' => false,
'dir' => null,
'expire' => 86400,
'dirLevel' => 1
);
// stack or queue
public $taskPoolType = 'stack';
// eliminate duplicate for taskpool, will delete previous task and add new one
public $taskOverride = false;
// task callback,add() should be called in callback, $cbTask[0] is callback, $cbTask[1] is param.
public $cbTask = null;
// status callback
public $cbInfo = null;
// user callback
public $cbUser = null;
// common fail callback, called if no one specified
public $cbFail = null;
// is the loop running
protected $isRunning = false;
// max thread num no type
protected $maxThreadNoType = null;
// all added task was saved here first
protected $taskPool = array ();
// taskPool with high priority
protected $taskPoolAhead = array ();
// running task(s)
protected $taskRunning = array ();
// failed task need to retry
protected $taskFail = array ();
// handle of multi-thread curl
private $mh = null;
// user error
private $userError = null;
// if __construct called
private $isConstructCalled = false;
// running info
private $info = array (
'all' => array (
// process start time
'startTime' => null,
// download start time
'startTimeDownload' => null,
// the real multi-thread num
'activeNum' => null,
// finished task in the queue
'queueNum' => null,
// byte
'downloadSize' => 0,
// finished task number,include failed task and cache
'finishNum' => 0,
// The number of cache used
'cacheNum' => 0,
// completely failed task number
'failNum' => 0,
// task num has added
'taskNum' => 0,
// task running num by type,
'taskRunningNumType' => array (),
// task ruuning num no type
'taskRunningNumNoType' => 0,
// $this->taskPool size
'taskPoolNum' => 0,
// $this->taskRunning size
'taskRunningNum' => 0,
// $this->taskFail size
'taskFailNum' => 0,
// finish percent
'finishPercent' => 0,
// time cost
'timeSpent' => 0,
// download time cost
'timeSpentDownload' => 0,
// curl task speed
'taskSpeedNoCache' => 0,
// network speed, bytes
'downloadSpeed' => 0
),
'running' => array ()
);
/**
 * Mark the instance as constructed (start() checks this flag) and reject
 * PHP versions older than 5.1.0.
 *
 * @throws CurlMulti_Exception when running on PHP < 5.1.0
 */
function __construct() {
    $this->isConstructCalled = true;

    $isTooOld = version_compare ( PHP_VERSION, '5.1.0', '<' );
    if ($isTooOld) {
        throw new CurlMulti_Exception ( 'PHP 5.1.0+ is needed' );
    }
}
/**
 * Add a task to the task pool.
 *
 * A task without a usable url only triggers an E_USER_WARNING and is not added.
 *
 * @param array $item
 *        array('url'=>'',['file'=>'',['opt'=>array(),['args'=>array(),['ctl'=>array('type'=>'','ahead'=>false,'cache'=>array('enable'=>bool,'expire'=>0),'close'=>true))]]]])
 * @param mixed $process
 *        success callback, first param array('info'=>,'content'=>), second param $item[args]
 * @param mixed $fail
 *        curl fail callback, first param array('error'=>array(0=>code,1=>msg),'info'=>array), second param $item[args]
 * @throws CurlMulti_Exception when $this->taskPoolType is neither 'queue' nor 'stack'
 * @return CurlMulti $this, for chaining
 */
function add(array $item, $process = null, $fail = null) {
    // The array type hint already rejects non-arrays, so no is_array() check is needed.
    // A missing 'url' key is treated like an empty url instead of raising an undefined-index notice.
    $item ['url'] = isset ( $item ['url'] ) ? trim ( $item ['url'] ) : '';
    if (empty ( $item ['url'] )) {
        user_error ( "url can't be empty, url=$item[url]", E_USER_WARNING );
        return $this;
    }

    // replace space with + to avoid some curl problems
    $item ['url'] = str_replace ( ' ', '+', $item ['url'] );

    // normalise optional keys
    if (empty ( $item ['file'] )) {
        $item ['file'] = null;
    }
    if (empty ( $item ['opt'] )) {
        $item ['opt'] = array ();
    }
    if (empty ( $item ['args'] )) {
        $item ['args'] = array ();
    }
    if (empty ( $item ['ctl'] )) {
        $item ['ctl'] = array ();
    }
    if (! isset ( $item ['ctl'] ['cache'] ) || ! isset ( $item ['ctl'] ['cache'] ['enable'] )) {
        $item ['ctl'] ['cache'] = array (
            'enable' => false,
            'expire' => 0
        );
    }
    if (! isset ( $item ['ctl'] ['ahead'] )) {
        $item ['ctl'] ['ahead'] = false;
    }
    if (empty ( $process )) {
        $process = null;
    }
    if (empty ( $fail )) {
        $fail = null;
    }

    $task = array ();
    $task [self::TASK_ITEM_URL] = $item ['url'];
    $task [self::TASK_ITEM_FILE] = $item ['file'];
    // args are wrapped in a list; the result/error array is later unshifted in front of them
    $task [self::TASK_ITEM_ARGS] = array (
        $item ['args']
    );
    $task [self::TASK_ITEM_OPT] = $item ['opt'];
    $task [self::TASK_ITEM_CTL] = $item ['ctl'];
    $task [self::TASK_PROCESS] = $process;
    $task [self::TASK_FAIL] = $fail;
    $task [self::TASK_TRYED] = 0;
    $task [self::TASK_CH] = null;
    $this->addTaskPool ( $task );
    $this->info ['all'] ['taskNum'] ++;

    return $this;
}
/**
 * Put a prepared task into the ahead pool or the normal pool.
 *
 * With $this->taskOverride enabled, tasks already waiting in either pool
 * with the same url are removed first.
 *
 * @param array $task task array as built by add()
 * @throws CurlMulti_Exception when $this->taskPoolType is neither 'queue' nor 'stack'
 */
private function addTaskPool($task) {
    if ($this->taskOverride) {
        $poolNames = array (
            'taskPoolAhead',
            'taskPool'
        );
        foreach ( $poolNames as $poolName ) {
            foreach ( $this->$poolName as $index => $queued ) {
                if ($queued [self::TASK_ITEM_URL] == $task [self::TASK_ITEM_URL]) {
                    unset ( $this->{$poolName} [$index] );
                }
            }
        }
    }

    // high priority tasks bypass the pool type
    if (true == $task [self::TASK_ITEM_CTL] ['ahead']) {
        $this->taskPoolAhead [] = $task;
        return;
    }

    switch ($this->taskPoolType) {
        case 'queue' :
            $this->taskPool [] = $task;
            break;
        case 'stack' :
            array_unshift ( $this->taskPool, $task );
            break;
        default :
            throw new CurlMulti_Exception ( 'taskPoolType not found, taskPoolType=' . $this->taskPoolType );
    }
}
/**
 * Perform the actual task(s).
 *
 * Runs the curl multi loop until there are no active transfers, no queued
 * messages and no tasks left in the fail list, the running list or the pool.
 * For every finished transfer the process callback (on success) or the fail
 * callback (after $this->maxTry retries) is invoked.
 *
 * @throws CurlMulti_Exception when already running or when __construct was not called
 */
function start() {
    if ($this->isRunning) {
        throw new CurlMulti_Exception ( __CLASS__ . ' is running !' );
    }
    if (false === $this->isConstructCalled) {
        throw new CurlMulti_Exception ( __CLASS__ . ' __construct is not called' );
    }
    $this->mh = curl_multi_init ();
    $this->info ['all'] ['startTime'] = time ();
    $this->info ['all'] ['timeStartDownload'] = null;
    $this->info ['all'] ['downloadSize'] = 0;
    $this->info ['all'] ['finishNum'] = 0;
    $this->info ['all'] ['cacheNum'] = 0;
    $this->info ['all'] ['failNum'] = 0;
    $this->info ['all'] ['taskNum'] = 0;
    $this->info ['all'] ['taskRunningNumNoType'] = 0;
    $this->setThreadData ();
    $this->isRunning = true;
    $this->addTask ();
    do {
        $this->exec ();
        curl_multi_select ( $this->mh );
        $this->callCbInfo ();
        if (isset ( $this->cbUser )) {
            call_user_func ( $this->cbUser );
        }
        while ( false != ($curlInfo = curl_multi_info_read ( $this->mh, $this->info ['all'] ['queueNum'] )) ) {
            $ch = $curlInfo ['handle'];
            $task = $this->taskRunning [( int ) $ch];
            $info = curl_getinfo ( $ch );
            // Read the error text now: the handle may be closed below, and
            // curl_error() on a closed handle cannot return the message.
            $curlErrorMsg = curl_error ( $ch );
            $this->info ['all'] ['downloadSize'] += $info ['size_download'];
            if (isset ( $task [self::TASK_FP] )) {
                fclose ( $task [self::TASK_FP] );
            }
            if ($curlInfo ['result'] == CURLE_OK) {
                $param = array ();
                $param ['info'] = $info;
                $param ['ext'] = array (
                    'ch' => $ch
                );
                // download tasks write to a file, so there is no in-memory content
                if (! isset ( $task [self::TASK_ITEM_FILE] )) {
                    $param ['content'] = curl_multi_getcontent ( $ch );
                }
            }
            curl_multi_remove_handle ( $this->mh, $ch );
            // must close first, otherwise the download may not be completed in the process callback
            if (! array_key_exists ( 'close', $task [self::TASK_ITEM_CTL] ) || $task [self::TASK_ITEM_CTL] ['close'] == true) {
                curl_close ( $ch );
            }
            if ($curlInfo ['result'] == CURLE_OK) {
                $this->process ( $task, $param, false );
            }
            // error handling: curl failure, or a user error raised through error() in the process callback
            $callFail = false;
            if ($curlInfo ['result'] !== CURLE_OK || isset ( $this->userError )) {
                if ($task [self::TASK_TRYED] >= $this->maxTry) {
                    // retries exhausted
                    if (isset ( $this->userError )) {
                        $err = array (
                            'error' => $this->userError
                        );
                    } else {
                        $err = array (
                            'error' => array (
                                $curlInfo ['result'],
                                $curlErrorMsg
                            )
                        );
                    }
                    $err ['info'] = $info;
                    if (isset ( $task [self::TASK_FAIL] ) || isset ( $this->cbFail )) {
                        array_unshift ( $task [self::TASK_ITEM_ARGS], $err );
                        $callFail = true;
                    } else {
                        echo "\nError " . implode ( ', ', $err ['error'] ) . ", url=$info[url]\n";
                    }
                    $this->info ['all'] ['failNum'] ++;
                } else {
                    // schedule a retry
                    $task [self::TASK_TRYED] ++;
                    $task [self::TASK_ITEM_CTL] ['useCache'] = false;
                    $this->taskFail [] = $task;
                    $this->info ['all'] ['taskNum'] ++;
                }
                if (isset ( $this->userError )) {
                    unset ( $this->userError );
                }
            }
            if ($callFail) {
                // the task level callback takes precedence over the common one
                if (isset ( $task [self::TASK_FAIL] )) {
                    call_user_func_array ( $task [self::TASK_FAIL], $task [self::TASK_ITEM_ARGS] );
                } elseif (isset ( $this->cbFail )) {
                    call_user_func_array ( $this->cbFail, $task [self::TASK_ITEM_ARGS] );
                }
            }
            unset ( $this->taskRunning [( int ) $ch] );
            if (array_key_exists ( 'type', $task [self::TASK_ITEM_CTL] )) {
                $this->info ['all'] ['taskRunningNumType'] [$task [self::TASK_ITEM_CTL] ['type']] --;
            } else {
                $this->info ['all'] ['taskRunningNumNoType'] --;
            }
            $this->addTask ();
            $this->info ['all'] ['finishNum'] ++;
            // if $this->info['all']['queueNum'] grow very fast there will be no efficiency lost,because outer $this->exec() won't be executed.
            $this->exec ();
            $this->callCbInfo ();
            if (isset ( $this->cbUser )) {
                call_user_func ( $this->cbUser );
            }
        }
    } while ( $this->info ['all'] ['activeNum'] || $this->info ['all'] ['queueNum'] || ! empty ( $this->taskFail ) || ! empty ( $this->taskRunning ) || ! empty ( $this->taskPool ) );
    $this->callCbInfo ( true );
    curl_multi_close ( $this->mh );
    unset ( $this->mh );
    $this->isRunning = false;
}
/**
 * Refresh the statistics in $this->info and pass them to $this->cbInfo.
 *
 * Unless $force is true, the callback is invoked only when the clock second
 * has changed since the previous invocation.
 *
 * @param bool $force invoke the callback regardless of the elapsed time
 */
private function callCbInfo($force = false) {
    static $lastTime;
    if (! isset ( $lastTime )) {
        $lastTime = time ();
    }
    $now = time ();
    $isDue = $force || $now - $lastTime > 0;
    if (! $isDue || ! isset ( $this->cbInfo )) {
        return;
    }
    $lastTime = $now;

    // work on a local copy of the summary and store it back before the callback runs
    $all = $this->info ['all'];
    $all ['taskPoolNum'] = count ( $this->taskPool );
    $all ['taskRunningNum'] = count ( $this->taskRunning );
    $all ['taskFailNum'] = count ( $this->taskFail );
    if ($all ['taskNum'] > 0) {
        $all ['finishPercent'] = round ( $all ['finishNum'] / $all ['taskNum'], 4 );
    }
    $all ['timeSpent'] = time () - $all ['startTime'];
    if (isset ( $all ['timeStartDownload'] )) {
        $all ['timeSpentDownload'] = time () - $all ['timeStartDownload'];
    }
    if ($all ['timeSpentDownload'] > 0) {
        $finishedWithoutCache = $all ['finishNum'] - $all ['cacheNum'];
        $all ['taskSpeedNoCache'] = round ( $finishedWithoutCache / $all ['timeSpentDownload'], 2 );
        $all ['downloadSpeed'] = round ( $all ['downloadSize'] / $all ['timeSpentDownload'], 2 );
    }
    $this->info ['all'] = $all;

    // per-handle info of the running transfers
    $this->info ['running'] = array ();
    foreach ( $this->taskRunning as $key => $runningTask ) {
        $this->info ['running'] [$key] = curl_getinfo ( $runningTask [self::TASK_CH] );
    }

    call_user_func_array ( $this->cbInfo, array (
        $this->info
    ) );
}
/**
 * Recompute $this->maxThreadNoType and synchronise the per-type running
 * counters in $this->info['all']['taskRunningNumType'] with $this->maxThreadType.
 */
private function setThreadData() {
    // threads left for untyped tasks, never negative
    $untyped = $this->maxThread - array_sum ( $this->maxThreadType );
    $this->maxThreadNoType = $untyped < 0 ? 0 : $untyped;

    // drop idle counters of types that are no longer configured
    foreach ( $this->info ['all'] ['taskRunningNumType'] as $type => $running ) {
        if ($running != 0 || array_key_exists ( $type, $this->maxThreadType )) {
            continue;
        }
        unset ( $this->info ['all'] ['taskRunningNumType'] [$type] );
    }

    // make sure every configured type has a counter
    foreach ( $this->maxThreadType as $type => $limit ) {
        if ($limit == 0) {
            user_error ( 'maxThreadType[' . $type . '] is 0, task of this type will never be added!', E_USER_WARNING );
        }
        if (array_key_exists ( $type, $this->info ['all'] ['taskRunningNumType'] )) {
            continue;
        }
        $this->info ['all'] ['taskRunningNumType'] [$type] = 0;
    }
}
/**
 * Call curl_multi_exec() repeatedly for as long as it returns
 * CURLM_CALL_MULTI_PERFORM; the active transfer count is stored in
 * $this->info['all']['activeNum'].
 */
private function exec() {
    do {
        $status = curl_multi_exec ( $this->mh, $this->info ['all'] ['activeNum'] );
    } while ( $status === CURLM_CALL_MULTI_PERFORM );
}
/**
* Fill free slots with tasks, keeping up to $this->maxThread transfers running.
*
* Each loop iteration takes one task (retry list first, then the ahead pool,
* then the normal pool). When caching is enabled and cache() returns an entry,
* the task is completed from the cache; otherwise a curl handle is created and
* attached to the multi handle if the thread limit for the task's type allows
* it, else the task is put back at the front of its list.
*
* @throws CurlMulti_Exception when $this->taskPoolType is neither 'stack' nor
*         'queue'; cache() may also throw it
*/
private function addTask() {
// number of free slots
$c = $this->maxThread - count ( $this->taskRunning );
while ( $c > 0 ) {
$task = array ();
// search failed first
if (! empty ( $this->taskFail )) {
$task = array_pop ( $this->taskFail );
} else {
// cbTask: invoked when the pool holds fewer than maxThread tasks
// (per the property comment, the callback is expected to call add())
if (0 < ($this->maxThread - count ( $this->taskPool )) and ! empty ( $this->cbTask )) {
if (! isset ( $this->cbTask [1] )) {
$this->cbTask [1] = array ();
}
call_user_func_array ( $this->cbTask [0], array (
$this->cbTask [1]
) );
}
// the ahead pool has priority over the normal pool
if (! empty ( $this->taskPoolAhead )) {
$task = array_pop ( $this->taskPoolAhead );
} elseif (! empty ( $this->taskPool )) {
if ($this->taskPoolType == 'stack') {
$task = array_pop ( $this->taskPool );
} elseif ($this->taskPoolType == 'queue') {
$task = array_shift ( $this->taskPool );
} else {
throw new CurlMulti_Exception ( 'taskPoolType not found, taskPoolType=' . $this->taskPoolType );
}
}
}
$noAdd = false;
$cache = null;
if (! empty ( $task )) {
// try the cache when it is enabled for this task or globally
if (true == $task [self::TASK_ITEM_CTL] ['cache'] ['enable'] || $this->cache ['enable']) {
$cache = $this->cache ( $task );
if (null !== $cache) {
// cache hit: download tasks get the cached content written to their target file
if (isset ( $task [self::TASK_ITEM_FILE] )) {
file_put_contents ( $task [self::TASK_ITEM_FILE], $cache ['content'], LOCK_EX );
unset ( $cache ['content'] );
}
$this->process ( $task, $cache, true );
$this->info ['all'] ['cacheNum'] ++;
$this->info ['all'] ['finishNum'] ++;
$this->callCbInfo ();
}
}
if (! $cache) {
$this->setThreadData ();
// a type that is not configured in maxThreadType is treated as "no type"
if (array_key_exists ( 'type', $task [self::TASK_ITEM_CTL] ) && ! array_key_exists ( $task [self::TASK_ITEM_CTL] ['type'], $this->maxThreadType )) {
user_error ( 'task was set to notype because type was not set in $this->maxThreadType, type=' . $task [self::TASK_ITEM_CTL] ['type'], E_USER_WARNING );
unset ( $task [self::TASK_ITEM_CTL] ['type'] );
}
if (array_key_exists ( 'type', $task [self::TASK_ITEM_CTL] )) {
$maxThread = $this->maxThreadType [$task [self::TASK_ITEM_CTL] ['type']];
$isNoType = false;
} else {
$maxThread = $this->maxThreadNoType;
$isNoType = true;
}
// NOTE(review): despite the wording of this warning, the task is not dropped here;
// the limit check below fails and the else branch puts it back into a pool.
if ($isNoType && $maxThread == 0) {
user_error ( 'task was disgarded because maxThreadNoType=0, url=' . $task [self::TASK_ITEM_URL], E_USER_WARNING );
}
// start the transfer only while the running count for this type is below its limit
if (($isNoType && $this->info ['all'] ['taskRunningNumNoType'] < $maxThread) || (! $isNoType && $this->info ['all'] ['taskRunningNumType'] [$task [self::TASK_ITEM_CTL] ['type']] < $maxThread)) {
$task [self::TASK_CH] = $this->curlInit ( $task [self::TASK_ITEM_URL] );
// is a download task?
if (isset ( $task [self::TASK_ITEM_FILE] )) {
// curl can create the last level directory
$dir = dirname ( $task [self::TASK_ITEM_FILE] );
if (! file_exists ( $dir ))
mkdir ( $dir, 0777 );
$task [self::TASK_FP] = fopen ( $task [self::TASK_ITEM_FILE], 'w' );
curl_setopt ( $task [self::TASK_CH], CURLOPT_FILE, $task [self::TASK_FP] );
}
// single task curl option
if (isset ( $task [self::TASK_ITEM_OPT] )) {
foreach ( $task [self::TASK_ITEM_OPT] as $k => $v ) {
curl_setopt ( $task [self::TASK_CH], $k, $v );
}
}
// running tasks are indexed by the integer value of their curl handle
$this->taskRunning [( int ) $task [self::TASK_CH]] = $task;
if (! isset ( $this->info ['all'] ['timeStartDownload'] )) {
$this->info ['all'] ['timeStartDownload'] = time ();
}
if ($isNoType) {
$this->info ['all'] ['taskRunningNumNoType'] ++;
} else {
$this->info ['all'] ['taskRunningNumType'] [$task [self::TASK_ITEM_CTL] ['type']] ++;
}
curl_multi_add_handle ( $this->mh, $task [self::TASK_CH] );
} else {
// rotate task to pool
if ($task [self::TASK_TRYED] > 0) {
array_unshift ( $this->taskFail, $task );
} else {
array_unshift ( $this->taskPool, $task );
}
$noAdd = true;
}
}
}
// a cache hit does not use up a slot, so the loop goes on to pull another task;
// every other outcome (started, rotated back, or no task available) uses one iteration
if (! $cache || $noAdd) {
$c --;
}
}
}
/**
 * Run the task's process callback and handle back-off and cache writing.
 *
 * The callback is called with $r followed by the task's args. When it returns
 * exactly false the task is put back into the pool, with task level caching
 * switched on (expire 3600) if no caching was enabled.
 *
 * @param array $task    task array as built by add()
 * @param array $r       result array handed to the callback as first argument
 * @param bool  $isCache true when $r was served from the cache
 */
private function process($task, $r, $isCache) {
    array_unshift ( $task [self::TASK_ITEM_ARGS], $r );
    if (isset ( $task [self::TASK_PROCESS] )) {
        $userRes = call_user_func_array ( $task [self::TASK_PROCESS], $task [self::TASK_ITEM_ARGS] );
    }
    // no callback, or a callback returning null, counts as success
    if (! isset ( $userRes )) {
        $userRes = true;
    }
    array_shift ( $task [self::TASK_ITEM_ARGS] );
    // backoff
    if (false === $userRes) {
        if (false == $this->cache ['enable'] && false == $task [self::TASK_ITEM_CTL] ['cache'] ['enable']) {
            $task [self::TASK_ITEM_CTL] ['cache'] = array (
                'enable' => true,
                'expire' => 3600
            );
        }
        $this->addTaskPool ( $task );
    }
    // write cache: only for fresh results without a user error, and only when
    // caching is enabled for the task or globally. The two "enabled" checks are
    // grouped explicitly; without the grouping `&&` binds tighter than `||`, so a
    // globally enabled cache was written even for cache hits and user errors.
    $cacheEnabled = true == $task [self::TASK_ITEM_CTL] ['cache'] ['enable'] || $this->cache ['enable'];
    if (false == $isCache && false == isset ( $this->userError ) && $cacheEnabled) {
        $this->cache ( $task, $r );
    }
}
/**
* Set or get the file cache entry of a task.
*
* The cache file path is derived from the md5 of the task url, split into
* sub-directories according to $this->cache['dirLevel'] (0, 1 or 2).
*
* @param array $task
*        task array as built by add()
* @param mixed $content
*        array('info','content') to store; omit (null) to read
* @return array|null|boolean read mode: the unserialized cache entry, or null when
*         there is no unexpired cache file; write mode: true
* @throws CurlMulti_Exception when the cache dir is undefined or missing, dirLevel is
*         invalid, a directory cannot be created or the cache file cannot be written
*/
private function cache($task, $content = null) {
if (! isset ( $this->cache ['dir'] ))
throw new CurlMulti_Exception ( 'Cache dir is not defined' );
$url = $task [self::TASK_ITEM_URL];
$key = md5 ( $url );
$isDownload = isset ( $task [self::TASK_ITEM_FILE] );
$file = rtrim ( $this->cache ['dir'], '/' ) . '/';
// spread cache files over sub-directories named after the leading characters of the key
if (isset ( $this->cache ['dirLevel'] ) && $this->cache ['dirLevel'] != 0) {
if ($this->cache ['dirLevel'] == 1) {
$file .= substr ( $key, 0, 3 ) . '/' . substr ( $key, 3 );
} elseif ($this->cache ['dirLevel'] == 2) {
$file .= substr ( $key, 0, 3 ) . '/' . substr ( $key, 3, 3 ) . '/' . substr ( $key, 6 );
} else {
throw new CurlMulti_Exception ( 'cache dirLevel is invalid, dirLevel=' . $this->cache ['dirLevel'] );
}
} else {
$file .= $key;
}
$r = null;
if (! isset ( $content )) {
// read mode
if (file_exists ( $file )) {
// the task level expire applies when the task itself enables caching, else the global one
if (true == $task [self::TASK_ITEM_CTL] ['cache'] ['enable']) {
$expire = $task [self::TASK_ITEM_CTL] ['cache'] ['expire'];
} else {
$expire = $this->cache ['expire'];
}
if (time () - filemtime ( $file ) < $expire) {
$r = file_get_contents ( $file );
if ($this->cache ['compress']) {
$r = gzuncompress ( $r );
}
// NOTE(review): unserialize() runs on whatever is stored in the cache file;
// the cache directory must not be writable by untrusted parties.
$r = unserialize ( $r );
// downloaded file content is stored base64 encoded (see write mode below)
if ($isDownload) {
$r ['content'] = base64_decode ( $r ['content'] );
}
}
}
} else {
// write mode
$r = false;
// check main cache directory
if (! is_dir ( $this->cache ['dir'] )) {
throw new CurlMulti_Exception ( "Cache dir doesn't exists" );
} else {
$dir = dirname ( $file );
// level 1 subdir
if (isset ( $this->cache ['dirLevel'] ) && $this->cache ['dirLevel'] > 1) {
$dir1 = dirname ( $dir );
if (! is_dir ( $dir1 ) && ! mkdir ( $dir1 )) {
throw new CurlMulti_Exception ( 'Create dir failed, dir=' . $dir1 );
}
}
if (! is_dir ( $dir ) && ! mkdir ( $dir )) {
throw new CurlMulti_Exception ( 'Create dir failed, dir=' . $dir );
}
// for download tasks the content is taken from the downloaded file
if ($isDownload) {
$content ['content'] = base64_encode ( file_get_contents ( $task [self::TASK_ITEM_FILE] ) );
}
$content = serialize ( $content );
if ($this->cache ['compress']) {
$content = gzcompress ( $content );
}
if (file_put_contents ( $file, $content, LOCK_EX )) {
$r = true;
} else {
throw new CurlMulti_Exception ( 'Write cache file failed' );
}
}
}
return $r;
}
/**
 * Flag a user error for the current callback (not a curl error).
 * Must be called in the process callback.
 *
 * The error is stored as array(CURLE_OK, $msg).
 *
 * @param mixed $msg error message
 */
function error($msg) {
    $userError = array (
        CURLE_OK,
        $msg
    );
    $this->userError = $userError;
}
/**
 * Public access to curlInit(): returns a curl handle initialised with the
 * default options and $this->opt.
 *
 * @param string|null $url optional url to set on the handle
 * @return resource
 */
function getch($url = null) {
    $handle = $this->curlInit ( $url );

    return $handle;
}
/**
 * Create a curl handle with the default options; entries of $this->opt
 * override the defaults.
 *
 * @param string|null $url optional url to set on the handle
 * @return resource
 */
private function curlInit($url = null) {
    // the url, when given, stays the first option
    $opt = isset ( $url ) ? array (
        CURLOPT_URL => $url
    ) : array ();
    $opt += array (
        CURLOPT_HEADER => false,
        CURLOPT_CONNECTTIMEOUT => 10,
        CURLOPT_TIMEOUT => 30,
        CURLOPT_AUTOREFERER => true,
        CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.47 Safari/536.11',
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_MAXREDIRS => 10
    );

    // user defined opt
    if (! empty ( $this->opt )) {
        foreach ( $this->opt as $name => $value ) {
            $opt [$name] = $value;
        }
    }

    $handle = curl_init ();
    curl_setopt_array ( $handle, $opt );

    return $handle;
}
}
/**
 * Exception type thrown by CurlMulti.
 */
class CurlMulti_Exception extends Exception
{
}

View File

@@ -1,934 +0,0 @@
<?php
/**
*
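* @desc HTTP request class supporting cURL and sockets. cURL is used by default;
* sockets are used when cURL is switched off manually via useCurl or when the curl extension is not installed.
* GET and POST requests are currently supported.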
*
* @example
*
1. 基本 get 请求:
$http = new Http(); // 实例化对象
$result = $http->get('http://weibo.com/at/comment');
2. 基本 post 请求:
$http = new Http(); // 实例化对象
$result = $http->post('http://someurl.com/post-new-article', array('title'=>$title, 'body'=>$body) );
3. 模拟登录 ( post 和 get 同时使用, 利用 cookie 存储状态 ) :
$http = new Http(); // 实例化对象
$http->setCookiepath(substr(md5($username), 0, 10)); // 设置 cookie, 如果是多个用户请求的话
// 提交 post 数据
$loginData = $http->post('http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.3.19)', array('username'=>$username, 'loginPass'=>$password) );
$result = $http->get('http://weibo.com/at/comment');
4. 利用 initialize 函数设置多个 config 信息
$httpConfig['method'] = 'GET';
$httpConfig['target'] = 'http://www.somedomain.com/index.html';
$httpConfig['referrer'] = 'http://www.somedomain.com';
$httpConfig['user_agent'] = 'My Crawler';
$httpConfig['timeout'] = '30';
$httpConfig['params'] = array('var1' => 'testvalue', 'var2' => 'somevalue');
$http = new Http();
$http->initialize($httpConfig);
$result = $http->result;
5. 复杂的设置:
$http = new Http();
$http->useCurl(false); // 不使用 curl
$http->setMethod('POST'); // 使用 POST method
// 设置 POST 数据
$http->addParam('user_name' , 'yourusername');
$http->addParam('password' , 'yourpassword');
// Referrer
$http->setReferrer('https://yourproject.projectpath.com/login');
// 开始执行请求
$http->execute('https://yourproject.projectpath.com/login/authenticate');
$result = $http->getResult();
6. 获取开启了 basic auth 的请求
$http = new Http();
// Set HTTP basic authentication realms
$http->setAuth('yourusername', 'yourpassword');
// 获取某个被保护的应用的 feed
$http->get('http://www.someblog.com/protected/feed.xml');
$result = $http->result;
*
* @from http://www.phpfour.com/lib/http
* @since Version 0.1
* @original author Md Emran Hasan <phpfour@gmail.com>
* @modify by Charlie Jade
*/
class Http
{
/** Request target @var string */
var $target;
/** Host of the target URL @var string */
var $host;
/** Port of the request target @var integer */
var $port;
/** Path of the request target @var string */
var $path;
/** Schema of the request target @var string */
var $schema;
/** Request method (GET or POST) @var string */
var $method;
/** Request data @var array */
var $params;
/** Cookie data sent with the request @var array */
var $cookies;
/** Cookie data returned by the request @var array */
var $_cookies;
/** Request timeout, default is 25 @var integer */
var $timeout;
/** Whether to use cURL, default is TRUE @var boolean */
var $useCurl;
/** Referrer information @var string */
var $referrer;
/** User agent of the requesting client @var string */
var $userAgent;
/** Contains the cookie path (to be used with cURL) @var string */
var $cookiePath;
/** Whether to use cookies @var boolean */
var $useCookie;
/** Whether to save cookies for the next request @var boolean */
var $saveCookie;
/** HTTP Basic Auth username (for authentication) @var string */
var $username;
/** HTTP Basic Auth password (for authentication) @var string */
var $password;
/** Result of the request @var string */
var $result;
/** Headers of the last request @var array */
var $headers;
/** Contains the last call's http status code @var string */
var $status;
/** Whether to follow HTTP redirects @var boolean */
var $redirect;
/** Maximum number of HTTP redirects @var integer */
var $maxRedirect;
/** How many URLs the current request has involved @var integer */
var $curRedirect;
/** Error code @var string */
var $error;
/** Store the next token @var string */
var $nextToken;
/** Whether to store debug information @var boolean */
var $debug;
/** Stores the debug messages @var array @todo will keep debug messages */
var $debugMsg;
/** Constructor for initializing the class with default values. @return void */
public function __construct()
{
// 先初始化
$this->clear();
}
/**
* 初始化配置信息
* Initialize preferences
*
* This function will take an associative array of config values and
* will initialize the class variables using them.
*
* Example use:
*
* <pre>
* $httpConfig['method'] = 'GET';
* $httpConfig['target'] = 'http://www.somedomain.com/index.html';
* $httpConfig['referrer'] = 'http://www.somedomain.com';
* $httpConfig['user_agent'] = 'My Crawler';
* $httpConfig['timeout'] = '30';
* $httpConfig['params'] = array('var1' => 'testvalue', 'var2' => 'somevalue');
*
* $http = new Http();
* $http->initialize($httpConfig);
* </pre>
*
* @param array Config values as associative array
* @return void
*/
public function initialize($config = array())
{
$this->clear();
foreach ($config as $key => $val)
{
if (isset($this->$key))
{
$method = 'set' . ucfirst(str_replace('_', '', $key));
if (method_exists($this, $method))
{
$this->$method($val);
}
else
{
$this->$key = $val;
}
}
}
}
/**
* 初始化所有
*
* Clears all the properties of the class and sets the object to
* the beginning state. Very handy if you are doing subsequent calls
* with different data.
*
* @return void
*/
public function clear()
{
// Set the request defaults
$this->host = '';
$this->port = 0;
$this->path = '';
$this->target = '';
$this->method = 'GET';
$this->schema = 'http';
$this->params = array();
$this->headers = array();
$this->cookies = array();
$this->_cookies = array();
// Set the config details
$this->debug = FALSE;
$this->error = '';
$this->status = 0;
$this->timeout = '25';
$this->useCurl = TRUE;
$this->referrer = '';
$this->username = '';
$this->password = '';
$this->redirect = TRUE;
// Set the cookie and agent defaults
$this->nextToken = '';
$this->useCookie = FALSE;
$this->saveCookie = FALSE;
$this->maxRedirect = 3;
$this->cookiePath = 'cookie.txt';
$this->userAgent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.63 Safari/535.7';
}
/** 设置目标 @return void */
public function setTarget($url)
{
$this->target = $url;
}
/** 设置 http 请求方法 @param string HTTP method to use (GET or POST) @return void */
public function setMethod($method)
{
$this->method = $method;
}
/** 设置 referrer URL @param string URL of referrer page @return void */
public function setReferrer($referrer)
{
$this->referrer = $referrer;
}
/** 设置 User agent @param string Full user agent string @return void */
public function setUseragent($agent)
{
$this->userAgent = $agent;
}
/** 设置请求 timeout @param integer Timeout delay in seconds @return void */
public function setTimeout($seconds)
{
$this->timeout = $seconds;
}
/** 设置 cookie path (只支持cURL ) @param string File location of cookiejar @return void */
public function setCookiepath($path)
{
$this->cookiePath = $path;
$this->useCookie(TRUE);
$this->saveCookie(TRUE);
}
/** 设置请求参数 parameters @param array GET or POST 的请求数据 @return void */
public function setParams($dataArray)
{
$this->params = array_merge($this->params, $dataArray);
}
/** 设置 basic http auth 域验证 @param string 用户名 @param string 密码 @return void */
public function setAuth($username, $password)
{
$this->username = $username;
$this->password = $password;
}
/** 设置最大跳转数 @param integer Maximum number of redirects @return void */
public function setMaxredirect($value)
{
$this->maxRedirect = $value;
}
/** 添加多一个新的请求数据 @param string Name of the parameter @param string Value of the paramete @return void */
public function addParam($name, $value)
{
$this->params[$name] = $value;
}
/** 添加 cookie 请求数据 @param string Name of cookie @param string Value of cookie */
public function addCookie($name, $value)
{
$this->cookies[$name] = $value;
}
/** 是否使用 curl, 默认 true, false 为使用 socket */
public function useCurl($value = TRUE)
{
if (is_bool($value))
{
$this->useCurl = $value;
}
}
/** 是否使用 cookie , 默认为 false @param boolean Whether to use cookies or not @return void */
public function useCookie($value = FALSE)
{
$this->useCookie = $value;
}
/** 是否使用 cookie , 以供下一次请求使用 @param boolean Whether to save persistent cookies or not @return void */
public function saveCookie($value = FALSE)
{
$this->saveCookie = $value;
}
/** 是否跟随 302 跳转 @param boolean Whether to follow HTTP redirects or not */
public function followRedirects($value = TRUE)
{
$this->redirect = $value;
}
/** 获取结果集 @return string output of execution */
public function getResult()
{
return $this->result;
}
/** 获取最后一个返回的 headers 数组 */
public function getHeaders()
{
return $this->headers;
}
/** 获取请求的状态码 */
public function getStatus()
{
return $this->status;
}
/** 获取最后运行错误 */
public function getError()
{
return $this->error;
}
/** 执行一条 http get 请求 */
public function get($url, $data=array()){
return $this->execute($url, '', 'GET', $data);
}
/** 执行一条 http post 请求 */
public function post($url, $data=array()){
return $this->execute($url, '', 'POST', $data);
}
/**
* 使用当前的配置, 发送一条 HTTP 请求
*
* @param string URL of the target page (optional)
* @param string URL of the referrer page (optional)
* @param string 请求方法 (GET or POST) (optional)
* @param array 请求数据, key 和 value 对应的数组 (optional)
* @return string 请求的结果集
*/
public function execute($target = '', $referrer = '', $method = '', $data = array())
{
// Populate the properties
$this->target = ($target) ? $target : $this->target;
$this->method = ($method) ? $method : $this->method;
$this->referrer = ($referrer) ? $referrer : $this->referrer;
// Add the new params
if (is_array($data) && count($data) > 0)
{
$this->params = array_merge($this->params, $data);
}
// Process data, if presented
if(is_array($this->params) && count($this->params) > 0)
{
// Get a blank slate
$tempString = array();
// Convert data array into a query string (ie animal=dog&sport=baseball)
foreach ($this->params as $key => $value)
{
if(strlen(trim($value))>0)
{
$tempString[] = $key . "=" . urlencode($value);
}
}
$queryString = join('&', $tempString);
}
// 如果 cURL 没有安装就使用 fscokopen 执行请求
$this->useCurl = $this->useCurl && in_array('curl', get_loaded_extensions());
// GET method configuration
if($this->method == 'GET')
{
if(isset($queryString))
{
$this->target = $this->target . "?" . $queryString;
}
}
// Parse target URL
$urlParsed = parse_url($this->target);
// Handle SSL connection request
if ($urlParsed['scheme'] == 'https')
{
$this->host = 'ssl://' . $urlParsed['host'];
$this->port = ($this->port != 0) ? $this->port : 443;
}
else
{
$this->host = $urlParsed['host'];
$this->port = ($this->port != 0) ? $this->port : 80;
}
// Finalize the target path
$this->path = (isset($urlParsed['path']) ? $urlParsed['path'] : '/') . (isset($urlParsed['query']) ? '?' . $urlParsed['query'] : '');
$this->schema = $urlParsed['scheme'];
// Pass the requred cookies
$this->_passCookies();
// Process cookies, if requested
if(is_array($this->cookies) && count($this->cookies) > 0)
{
// Get a blank slate
$tempString = array();
// Convert cookiesa array into a query string (ie animal=dog&sport=baseball)
foreach ($this->cookies as $key => $value)
{
if(strlen(trim($value)) > 0)
{
$tempString[] = $key . "=" . urlencode($value);
}
}
$cookieString = join('&', $tempString);
}
// Do we need to use cURL
if ($this->useCurl)
{
// Initialize PHP cURL handle
$ch = curl_init();
// GET method configuration
if($this->method == 'GET')
{
curl_setopt ($ch, CURLOPT_HTTPGET, TRUE);
curl_setopt ($ch, CURLOPT_POST, FALSE);
}
// POST method configuration
else
{
if(isset($queryString))
{
curl_setopt ($ch, CURLOPT_POSTFIELDS, $queryString);
}
curl_setopt ($ch, CURLOPT_POST, TRUE);
curl_setopt ($ch, CURLOPT_HTTPGET, FALSE);
}
// Basic Authentication configuration
if ($this->username && $this->password)
{
curl_setopt($ch, CURLOPT_USERPWD, $this->username . ':' . $this->password);
}
// Custom cookie configuration
if($this->useCookie && isset($cookieString))
{
curl_setopt ($ch, CURLOPT_COOKIE, $cookieString);
}
curl_setopt($ch, CURLOPT_HEADER, array('Accept-Language: zh-cn','Connection: Keep-Alive','Cache-Control: no-cache'));
curl_setopt($ch, CURLOPT_NOBODY, FALSE); // Return body
curl_setopt($ch, CURLOPT_COOKIEJAR, $this->cookiePath); // cookie 文件
curl_setopt($ch, CURLOPT_COOKIEFILE, $this->cookiePath); // cookie 文件
curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout); // Timeout
curl_setopt($ch, CURLOPT_USERAGENT, $this->userAgent); // Webbot name
curl_setopt($ch, CURLOPT_URL, $this->target); // Target site
curl_setopt($ch, CURLOPT_REFERER, $this->referrer); // Referer value
curl_setopt($ch, CURLOPT_VERBOSE, FALSE); // Minimize logs
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE); // No certificate
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, $this->redirect); // Follow redirects
curl_setopt($ch, CURLOPT_MAXREDIRS, $this->maxRedirect); // Limit redirections to four
curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE); // 是否以 string 格式返回
// Get the target contents
$content = curl_exec($ch);
// Get the request info
$curl_info = curl_getinfo($ch);
$header_size = $curl_info["header_size"];
// 赋值结果集
$this->result = substr($content, $header_size);
$reader = explode("\r\n\r\n", trim(substr($content, 0, $header_size)));
$this->status = $curl_info['http_code'];
// Parse the headers
$this->_parseHeaders( explode("\r\n\r\n", trim(substr($content, 0, $header_size))) );
// Store the error (is any)
$this->_setError(curl_error($ch));
// Close PHP cURL handle
curl_close($ch);
}
else
{
// Get a file pointer
$filePointer = fsockopen($this->host, $this->port, $errorNumber, $errorString, $this->timeout);
// We have an error if pointer is not there
if (!$filePointer)
{
$this->_setError('Failed opening http socket connection: ' . $errorString . ' (' . $errorNumber . ')');
return FALSE;
}
// Set http headers with host, user-agent and content type
$requestHeader = $this->method . " " . $this->path . " HTTP/1.1\r\n";
$requestHeader .= "Host: " . $urlParsed['host'] . "\r\n";
$requestHeader .= "User-Agent: " . $this->userAgent . "\r\n";
$requestHeader .= "Content-Type: application/x-www-form-urlencoded\r\n";
// Specify the custom cookies
if ($this->useCookie && $cookieString != '')
{
$requestHeader.= "Cookie: " . $cookieString . "\r\n";
}
// POST method configuration
if ($this->method == "POST")
{
$requestHeader.= "Content-Length: " . strlen($queryString) . "\r\n";
}
// Specify the referrer
if ($this->referrer != '')
{
$requestHeader.= "Referer: " . $this->referrer . "\r\n";
}
// Specify http authentication (basic)
if ($this->username && $this->password)
{
$requestHeader.= "Authorization: Basic " . base64_encode($this->username . ':' . $this->password) . "\r\n";
}
$requestHeader.= "Connection: close\r\n\r\n";
// POST method configuration
if ($this->method == "POST")
{
$requestHeader .= $queryString;
}
// We're ready to launch
fwrite($filePointer, $requestHeader);
// Clean the slate
$responseHeader = '';
$responseContent = '';
// 3...2...1...Launch !
do
{
$responseHeader .= fread($filePointer, 1);
}
while (!preg_match('/\\r\\n\\r\\n$/', $responseHeader));
// Parse the headers
$this->_parseHeaders($responseHeader);
// Do we have a 301/302 redirect ?
if (($this->status == '301' || $this->status == '302') && $this->redirect == TRUE)
{
if ($this->curRedirect < $this->maxRedirect)
{
// Let's find out the new redirect URL
$newUrlParsed = parse_url($this->headers['location']);
if ($newUrlParsed['host'])
{
$newTarget = $this->headers['location'];
}
else
{
$newTarget = $this->schema . '://' . $this->host . '/' . $this->headers['location'];
}
// Reset some of the properties
$this->port = 0;
$this->status = 0;
$this->params = array();
$this->method = 'GET';
$this->referrer = $this->target;
// Increase the redirect counter
$this->curRedirect++;
// Let's go, go, go !
$this->result = $this->execute($newTarget);
}
else
{
$this->_setError('Too many redirects.');
return FALSE;
}
}
else
{
// Nope...so lets get the rest of the contents (non-chunked)
if ($this->headers['transfer-encoding'] != 'chunked')
{
while (!feof($filePointer))
{
$responseContent .= fgets($filePointer, 128);
}
}
else
{
// Get the contents (chunked)
while ($chunkLength = hexdec(fgets($filePointer)))
{
$responseContentChunk = '';
$readLength = 0;
while ($readLength < $chunkLength)
{
$responseContentChunk .= fread($filePointer, $chunkLength - $readLength);
$readLength = strlen($responseContentChunk);
}
$responseContent .= $responseContentChunk;
fgets($filePointer);
}
}
// Store the target contents
$this->result = chop($responseContent);
}
}
// There it is! We have it!! Return to base !!!
return $this->result;
}
/** 解析 header 信息*/
private function _parseHeaders($responseHeader)
{
// Break up the headers
$headers = $responseHeader;
// Clear the header array
$this->_clearHeaders();
// Get resposne status
if($this->status == 0)
{
// Oooops !
if(!eregi($match = "^http/[0-9]+\\.[0-9]+[ \t]+([0-9]+)[ \t]*(.*)\$", $headers[0], $matches))
{
$this->_setError('Unexpected HTTP response status');
return FALSE;
}
// Gotcha!
$this->status = $matches[1];
array_shift($headers);
}
// Prepare all the other headers
foreach ($headers as $header)
{
// Get name and value
$headerName = strtolower($this->_tokenize($header, ':'));
$headerValue = trim(chop($this->_tokenize("\r\n")));
// If its already there, then add as an array. Otherwise, just keep there
if(isset($this->headers[$headerName]))
{
if(gettype($this->headers[$headerName]) == "string")
{
$this->headers[$headerName] = array($this->headers[$headerName]);
}
$this->headers[$headerName][] = $headerValue;
}
else
{
$this->headers[$headerName] = $headerValue;
}
}
// Save cookies if asked
if ($this->saveCookie && isset($this->headers['set-cookie']))
{
$this->_parseCookie();
}
}
/** 去除所有 header 信息 */
private function _clearHeaders()
{
$this->headers = array();
}
/** 解析 COOKIE */
private function _parseCookie()
{
// Get the cookie header as array
if(gettype($this->headers['set-cookie']) == "array")
{
$cookieHeaders = $this->headers['set-cookie'];
}
else
{
$cookieHeaders = array($this->headers['set-cookie']);
}
// Loop through the cookies
for ($cookie = 0; $cookie < count($cookieHeaders); $cookie++)
{
$cookieName = trim($this->_tokenize($cookieHeaders[$cookie], "="));
$cookieValue = $this->_tokenize(";");
$urlParsed = parse_url($this->target);
$domain = $urlParsed['host'];
$secure = '0';
$path = "/";
$expires = "";
while(($name = trim(urldecode($this->_tokenize("=")))) != "")
{
$value = urldecode($this->_tokenize(";"));
switch($name)
{
case "path" : $path = $value; break;
case "domain" : $domain = $value; break;
case "secure" : $secure = ($value != '') ? '1' : '0'; break;
}
}
$this->_setCookie($cookieName, $cookieValue, $expires, $path , $domain, $secure);
}
}
/** 设置 cookie , 为下一次请求做准备 */
private function _setCookie($name, $value, $expires = "" , $path = "/" , $domain = "" , $secure = 0)
{
if(strlen($name) == 0)
{
return($this->_setError("No valid cookie name was specified."));
}
if(strlen($path) == 0 || strcmp($path[0], "/"))
{
return($this->_setError("$path is not a valid path for setting cookie $name."));
}
if($domain == "" || !strpos($domain, ".", $domain[0] == "." ? 1 : 0))
{
return($this->_setError("$domain is not a valid domain for setting cookie $name."));
}
$domain = strtolower($domain);
if(!strcmp($domain[0], "."))
{
$domain = substr($domain, 1);
}
$name = $this->_encodeCookie($name, true);
$value = $this->_encodeCookie($value, false);
$secure = intval($secure);
$this->_cookies[] = array( "name" => $name,
"value" => $value,
"domain" => $domain,
"path" => $path,
"expires" => $expires,
"secure" => $secure
);
}
/** cookie 数据集编码 */
private function _encodeCookie($value, $name)
{
return($name ? str_replace("=", "%25", $value) : str_replace(";", "%3B", $value));
}
/** 把正确的 cookie 传输给当前请求 */
private function _passCookies()
{
if (is_array($this->_cookies) && count($this->_cookies) > 0)
{
$urlParsed = parse_url($this->target);
$tempCookies = array();
foreach($this->_cookies as $cookie)
{
if ($this->_domainMatch($urlParsed['host'], $cookie['domain']) && (0 === strpos($urlParsed['path'], $cookie['path']))
&& (empty($cookie['secure']) || $urlParsed['protocol'] == 'https'))
{
$tempCookies[$cookie['name']][strlen($cookie['path'])] = $cookie['value'];
}
}
// cookies with longer paths go first
foreach ($tempCookies as $name => $values)
{
krsort($values);
foreach ($values as $value)
{
$this->addCookie($name, $value);
}
}
}
}
/** 匹配域名 */
private function _domainMatch($requestHost, $cookieDomain)
{
if ('.' != $cookieDomain{0})
{
return $requestHost == $cookieDomain;
}
elseif (substr_count($cookieDomain, '.') < 2)
{
return false;
}
else
{
return substr('.'. $requestHost, - strlen($cookieDomain)) == $cookieDomain;
}
}
/** 给当前操作做记号用的 */
private function _tokenize($string, $separator = '')
{
if(!strcmp($separator, ''))
{
$separator = $string;
$string = $this->nextToken;
}
for($character = 0; $character < strlen($separator); $character++)
{
if(gettype($position = strpos($string, $separator[$character])) == "integer")
{
$found = (isset($found) ? min($found, $position) : $position);
}
}
if(isset($found))
{
$this->nextToken = substr($string, $found + 1);
return(substr($string, 0, $found));
}
else
{
$this->nextToken = '';
return($string);
}
}
/** 设置错误信息 */
private function _setError($error)
{
if ($error != '')
{
$this->error = $error;
return $error;
}
}
}
?>

BIN
logo.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 15 KiB

File diff suppressed because it is too large Load Diff

19
phpunit.xml Normal file
View File

@@ -0,0 +1,19 @@
<phpunit
bootstrap="vendor/autoload.php"
convertErrorsToExceptions="true"
convertNoticesToExceptions="true"
convertWarningsToExceptions="true"
>
<testsuites>
<testsuite name="querylist">
<directory>./tests</directory>
</testsuite>
</testsuites>
<filter>
<whitelist>
<directory suffix=".php">src</directory>
</whitelist>
</filter>
</phpunit>

94
src/Config.php Normal file
View File

@@ -0,0 +1,94 @@
<?php
/**
* Created by PhpStorm.
* User: Jaeger <JaegerCode@gmail.com>
* Date: 2017/9/22
*/
namespace QL;
use Closure;
use Tightenco\Collect\Support\Collection;
class Config
{
    /** @var Config|null Shared instance handed out by getInstance() */
    protected static $instance = null;

    /** @var Collection Plugins registered through use() */
    protected $plugins;

    /** @var Collection Closures registered through bind(), keyed by name */
    protected $binds;

    /**
     * Config constructor: starts with empty plugin and bind collections.
     */
    public function __construct()
    {
        $this->binds = new Collection();
        $this->plugins = new Collection();
    }

    /**
     * Get the Config instance, creating it on first use.
     *
     * @return null|Config
     */
    public static function getInstance()
    {
        if (!self::$instance) {
            self::$instance = new self();
        }

        return self::$instance;
    }

    /**
     * Global installation plugin
     *
     * A string is recorded together with its options; anything else is
     * merged into the plugin collection as given.
     *
     * @param $plugins
     * @param array ...$opt
     * @return $this
     */
    public function use($plugins, ...$opt)
    {
        if (!is_string($plugins)) {
            $this->plugins = $this->plugins->merge($plugins);

            return $this;
        }

        $this->plugins->push([$plugins, $opt]);

        return $this;
    }

    /**
     * Global binding custom method
     *
     * @param string $name
     * @param Closure $provider
     * @return $this
     */
    public function bind(string $name, Closure $provider)
    {
        $this->binds->put($name, $provider);

        return $this;
    }

    /**
     * Apply the recorded plugins, then the recorded binds, to a QueryList.
     *
     * @param QueryList $queryList
     */
    public function bootstrap(QueryList $queryList)
    {
        $this->installPlugins($queryList);
        $this->installBind($queryList);
    }

    /**
     * Call use() on the QueryList for every recorded plugin.
     *
     * @param QueryList $queryList
     */
    protected function installPlugins(QueryList $queryList)
    {
        foreach ($this->plugins->all() as $plugin) {
            if (is_string($plugin)) {
                $queryList->use($plugin);
                continue;
            }

            // Recorded as [plugin, options]
            $queryList->use($plugin[0], ...$plugin[1]);
        }
    }

    /**
     * Call bind() on the QueryList for every recorded closure.
     *
     * @param QueryList $queryList
     */
    protected function installBind(QueryList $queryList)
    {
        foreach ($this->binds->all() as $name => $provider) {
            $queryList->bind($name, $provider);
        }
    }
}

View File

@@ -0,0 +1,15 @@
<?php
/**
* Created by PhpStorm.
* User: Jaeger <JaegerCode@gmail.com>
* Date: 2017/9/22
*/
namespace QL\Contracts;
use QL\QueryList;
/**
 * Interface PluginContract
 *
 * Declares the static install() entry point of a plugin.
 * NOTE(review): presumably invoked by QueryList::use(); the caller is not
 * visible here - confirm.
 */
interface PluginContract
{
/**
 * @param QueryList $queryList The QueryList instance passed to the plugin
 * @param mixed ...$opt Additional options, passed through as given
 */
public static function install(QueryList $queryList,...$opt);
}

View File

@@ -0,0 +1,15 @@
<?php
/**
* Created by PhpStorm.
* User: Jaeger <JaegerCode@gmail.com>
* Date: 2017/9/20
*/
namespace QL\Contracts;
use QL\Kernel;
/**
 * Interface ServiceProviderContract
 *
 * Declares the register() method of a service provider.
 */
interface ServiceProviderContract
{
/**
 * @param Kernel $kernel The Kernel instance passed to the provider
 */
public function register(Kernel $kernel);
}

30
src/Dom/Dom.php Normal file
View File

@@ -0,0 +1,30 @@
<?php
/**
* Created by PhpStorm.
* User: Jaeger <JaegerCode@gmail.com>
* Date: 2017/9/19
*/
namespace QL\Dom;
use phpQueryObject;
class Dom
{
    /** @var phpQueryObject Document that find() searches */
    protected $document;

    /**
     * Dom constructor.
     *
     * @param phpQueryObject $document
     */
    public function __construct(phpQueryObject $document)
    {
        $this->document = $document;
    }

    /**
     * Run a selector against the document and wrap the matches.
     *
     * @param $selector
     * @return Elements
     */
    public function find($selector)
    {
        return new Elements($this->document->find($selector));
    }
}

260
src/Dom/Elements.php Normal file
View File

@@ -0,0 +1,260 @@
<?php
/**
* Created by PhpStorm.
* User: Jaeger <JaegerCode@gmail.com>
* Date: 2017/9/19
*/
namespace QL\Dom;
use phpDocumentor\Reflection\Types\Null_;
use phpQueryObject;
use Tightenco\Collect\Support\Collection;
/**
* Class Elements
* @package QL\Dom
*
* @method Elements toReference(&$var)
* @method Elements documentFragment($state = null)
* @method Elements toRoot()
* @method Elements getDocumentIDRef(&$documentID)
* @method Elements getDocument()
* @method \DOMDocument getDOMDocument()
* @method Elements getDocumentID()
* @method Elements unloadDocument()
* @method bool isHTML()
* @method bool isXHTML()
* @method bool isXML()
* @method string serialize()
* @method array serializeArray($submit = null)
* @method \DOMElement|\DOMElement[] get($index = null, $callback1 = null, $callback2 = null, $callback3 = null)
* @method string|array getString($index = null, $callback1 = null, $callback2 = null, $callback3 = null)
* @method string|array getStrings($index = null, $callback1 = null, $callback2 = null, $callback3 = null)
* @method Elements newInstance($newStack = null)
* @method Elements find($selectors, $context = null, $noHistory = false)
* @method Elements|bool is($selector, $nodes = null)
* @method Elements filterCallback($callback, $_skipHistory = false)
* @method Elements filter($selectors, $_skipHistory = false)
* @method Elements load($url, $data = null, $callback = null)
* @method Elements trigger($type, $data = [])
* @method Elements triggerHandler($type, $data = [])
* @method Elements bind($type, $data, $callback = null)
* @method Elements unbind($type = null, $callback = null)
* @method Elements change($callback = null)
* @method Elements submit($callback = null)
* @method Elements click($callback = null)
* @method Elements wrapAllOld($wrapper)
* @method Elements wrapAll($wrapper)
* @method Elements wrapAllPHP($codeBefore, $codeAfter)
* @method Elements wrap($wrapper)
* @method Elements wrapPHP($codeBefore, $codeAfter)
* @method Elements wrapInner($wrapper)
* @method Elements wrapInnerPHP($codeBefore, $codeAfter)
* @method Elements contents()
* @method Elements contentsUnwrap()
* @method Elements switchWith($markup)
* @method Elements eq($num)
* @method Elements size()
* @method Elements length()
* @method int count()
* @method Elements end($level = 1)
* @method Elements _clone()
* @method Elements replaceWithPHP($code)
* @method Elements replaceWith($content)
* @method Elements replaceAll($selector)
* @method Elements remove($selector = null)
* @method Elements|string markup($markup = null, $callback1 = null, $callback2 = null, $callback3 = null)
* @method string markupOuter($callback1 = null, $callback2 = null, $callback3 = null)
* @method Elements|string html($html = null, $callback1 = null, $callback2 = null, $callback3 = null)
* @method Elements|string xml($xml = null, $callback1 = null, $callback2 = null, $callback3 = null)
* @method string htmlOuter($callback1 = null, $callback2 = null, $callback3 = null)
* @method string xmlOuter($callback1 = null, $callback2 = null, $callback3 = null)
* @method Elements php($code)
* @method string markupPHP($code)
* @method string markupOuterPHP()
* @method Elements children($selector)
* @method Elements ancestors($selector)
* @method Elements append($content)
* @method Elements appendPHP($content)
* @method Elements appendTo($seletor)
* @method Elements prepend($content)
* @method Elements prependPHP($content)
* @method Elements prependTo($seletor)
* @method Elements before($content)
* @method Elements beforePHP($content)
* @method Elements insertBefore($seletor)
* @method Elements after($content)
* @method Elements afterPHP($content)
* @method Elements insertAfter($seletor)
* @method Elements insert($target, $type)
* @method int index($subject)
* @method Elements slice($start, $end = null)
* @method Elements reverse()
* @method Elements|string text($text = null, $callback1 = null, $callback2 = null, $callback3 = null)
* @method Elements plugin($class, $file = null)
* @method Elements _next($selector = null)
* @method Elements _prev($selector = null)
* @method Elements prev($selector = null)
* @method Elements prevAll($selector = null)
* @method Elements nextAll($selector = null)
* @method Elements siblings($selector = null)
* @method Elements not($selector = null)
* @method Elements add($selector = null)
* @method Elements parent($selector = null)
* @method Elements parents($selector = null)
* @method Elements stack($nodeTypes = null)
* @method Elements|string attr($attr = null, $value = null)
* @method Elements attrPHP($attr, $code)
* @method Elements removeAttr($attr)
* @method Elements|string val($val = null)
* @method Elements andSelf()
* @method Elements addClass($className)
* @method Elements addClassPHP($className)
* @method bool hasClass($className)
* @method Elements removeClass($className)
* @method Elements toggleClass($className)
* @method Elements _empty()
* @method Elements callback($callback, $param1 = null, $param2 = null, $param3 = null)
* @method string data($key, $value = null)
* @method Elements removeData($key)
* @method void rewind()
* @method Elements current()
* @method int key()
* @method Elements next($cssSelector = null)
* @method bool valid()
* @method bool offsetExists($offset)
* @method Elements offsetGet($offset)
* @method void offsetSet($offset, $value)
* @method string whois($oneNode)
* @method Elements dump()
* @method Elements dumpWhois()
* @method Elements dumpLength()
* @method Elements dumpTree($html, $title)
* @method dumpDie()
*/
class Elements
{
    /**
     * The wrapped phpQuery selection.
     *
     * @var phpQueryObject
     */
    protected $elements;

    /**
     * Elements constructor.
     *
     * @param phpQueryObject $elements
     */
    public function __construct(phpQueryObject $elements)
    {
        $this->elements = $elements;
    }

    /**
     * Read a property of the wrapped object, falling back to the HTML
     * attribute of the same name when no such property exists.
     */
    public function __get($name)
    {
        if (property_exists($this->elements, $name)) {
            return $this->elements->$name;
        }

        return $this->elements->attr($name);
    }

    /**
     * Forward a call to the wrapped object. A phpQueryObject result is
     * wrapped in a new Elements; a string result is trimmed.
     */
    public function __call($name, $arguments)
    {
        $result = call_user_func_array([$this->elements, $name], $arguments);

        if ($result instanceof phpQueryObject) {
            return new self($result);
        }

        return is_string($result) ? trim($result) : $result;
    }

    /**
     * Iterating elements
     *
     * The callback receives each element wrapped as Elements, plus its key;
     * returning false from it stops the iteration.
     *
     * @param callable $callback
     *
     * @return $this
     */
    public function each(callable $callback)
    {
        foreach ($this->elements as $index => $node) {
            if ($callback(new self(pq($node)), $index) === false) {
                break;
            }
        }

        return $this;
    }

    /**
     * Iterating elements
     *
     * Collects the callback's return value for every element.
     *
     * @param $callback
     * @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection
     */
    public function map($callback)
    {
        $results = new Collection();

        $this->elements->each(function ($node) use ($results, $callback) {
            $results->push($callback(new self(pq($node))));
        });

        return $results;
    }

    /**
     * Gets the attributes of all the elements
     *
     * @param string $attr HTML attribute name
     * @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection
     */
    public function attrs($attr)
    {
        $readAttribute = function ($element) use ($attr) {
            return $element->attr($attr);
        };

        return $this->map($readAttribute);
    }

    /**
     * Gets the text of all the elements
     *
     * @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection
     */
    public function texts()
    {
        $readText = function ($element) {
            return trim($element->text());
        };

        return $this->map($readText);
    }

    /**
     * Gets the html of all the elements
     *
     * @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection
     */
    public function htmls()
    {
        $readHtml = function ($element) {
            return trim($element->html());
        };

        return $this->map($readHtml);
    }

    /**
     * Gets the htmlOuter of all the elements
     *
     * @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection
     */
    public function htmlOuters()
    {
        $readOuterHtml = function ($element) {
            return trim($element->htmlOuter());
        };

        return $this->map($readOuterHtml);
    }

    /**
     * Get the wrapped phpQuery selection.
     *
     * @return phpQueryObject
     */
    public function getElements(): phpQueryObject
    {
        return $this->elements;
    }
}

322
src/Dom/Query.php Normal file
View File

@@ -0,0 +1,322 @@
<?php
/**
* Created by PhpStorm.
* User: Jaeger <JaegerCode@gmail.com>
* Date: 2017/9/21
*/
namespace QL\Dom;
use Tightenco\Collect\Support\Collection;
use phpQuery;
use phpQueryObject;
use QL\QueryList;
use Closure;
class Query
{
protected $html;
/**
* @var \phpQueryObject
*/
protected $document;
protected $rules;
protected $range = null;
protected $ql;
/**
* @var Collection
*/
protected $data;
/**
 * Query constructor.
 *
 * @param QueryList $ql Instance kept on the object and returned by the fluent methods
 */
public function __construct(QueryList $ql)
{
    $this->ql = $ql;
}
/**
* @param bool $rel
* @return String
*/
public function getHtml($rel = true)
{
    // Truthy $rel: outer HTML of the parsed document; otherwise the stored HTML string.
    if ($rel) {
        return $this->document->htmlOuter();
    }

    return $this->html;
}
/**
 * Store the HTML and build a new phpQuery document from it.
 *
 * @param $html HTML to load; it is passed through value() before being stored
 * @param null $charset Charset handed to phpQuery::newDocumentHTML()
 * @return QueryList
 */
public function setHtml($html, $charset = null)
{
$this->html = value($html);
// Drop the previous document before creating the new one
$this->destroyDocument();
$this->document = phpQuery::newDocumentHTML($this->html, $charset);
return $this->ql;
}
/**
* Get crawl results
*
* @param Closure|null $callback
* @return Collection|static
*/
public function getData(Closure $callback = null)
{
    // Run the stored results through handleData() with the optional callback.
    $processed = $this->handleData($this->data, $callback);

    return $processed;
}
/**
* @param Collection $data
*/
public function setData(Collection $data)
{
    // Replace the stored results.
    $this->data = $data;
}
/**
* Searches for all elements that match the specified expression.
*
* @param $selector A string containing a selector expression to match elements against.
* @return Elements
*/
public function find($selector)
{
    // Delegate to a Dom wrapper built around the current document.
    $dom = new Dom($this->document);

    return $dom->find($selector);
}
/**
* Set crawl rule
*
* $rules = [
* 'rule_name1' => ['selector','HTML attribute | text | html','Tag filter list','callback'],
* 'rule_name2' => ['selector','HTML attribute | text | html','Tag filter list','callback'],
* // ...
* ]
*
* @param array $rules
* @return QueryList
*/
public function rules(array $rules)
{
    // Remember the rules; getList() reads them.
    $this->rules = $rules;

    return $this->ql;
}
/**
* Set the slice area for crawl list
*
* @param $selector
* @return QueryList
*/
public function range($selector)
{
    // Remember the range selector; getList() and handleData() branch on it.
    $this->range = $selector;

    return $this->ql;
}
/**
* Remove HTML head,try to solve the garbled
*
* @return QueryList
*/
public function removeHead()
{
    $headPattern = '/(<head>|<head\s+.+?>).+?<\/head>/is';
    $withoutHead = preg_replace($headPattern, '<head></head>', $this->html);

    // Reload the document only when the replacement yields a non-empty string.
    if ($withoutHead) {
        $this->setHtml($withoutHead);
    }

    return $this->ql;
}
/**
* Execute the query rule
*
* @param Closure|null $callback
* @return QueryList
*/
public function query(Closure $callback = null)
{
    // Store the raw list first, so it is already in place while the callback runs.
    $list = $this->getList();
    $this->data = $list;
    $this->data = $this->handleData($list, $callback);

    return $this->ql;
}
/**
 * Apply a callback to the results.
 *
 * Without a range, the callback is called once with all the data as an
 * array and its return value becomes the new collection. With a range,
 * the callback is mapped over the collection's items.
 *
 * @param Collection $data
 * @param callable|null $callback Anything that is not callable leaves $data untouched
 * @return Collection
 */
public function handleData(Collection $data, $callback)
{
    if (!is_callable($callback)) {
        return $data;
    }

    if (empty($this->range)) {
        return new Collection($callback($data->all(), null));
    }

    return $data->map($callback);
}
/**
 * Run every rule against the document and collect the extracted content.
 *
 * Without a range the result is keyed by rule name. With a range there is
 * one row per element matched by the range selector, each keyed by rule name.
 *
 * @return Collection
 */
protected function getList()
{
    $data = [];

    if (empty($this->range)) {
        foreach ($this->rules as $key => $reg_value) {
            $rule = $this->parseRule($reg_value);
            $contentElements = $this->document->find($rule['selector']);
            $data[$key] = $this->extractContent($contentElements, $key, $rule);
        }
        return new Collection($data);
    }

    // Rule definitions do not depend on the range element, so parse them once
    // instead of once per matched element.
    $parsedRules = [];
    foreach ($this->rules as $key => $reg_value) {
        $parsedRules[$key] = $this->parseRule($reg_value);
    }

    $i = 0;
    foreach ($this->document->find($this->range) as $element) {
        foreach ($parsedRules as $key => $rule) {
            // Search only inside the current range element.
            $contentElements = pq($element)->find($rule['selector']);
            $data[$i][$key] = $this->extractContent($contentElements, $key, $rule);
        }
        // The row index advances for every element, even when there are no rules.
        $i++;
    }

    return new Collection($data);
}
/**
* Extract the content described by one parsed rule from the matched elements.
*
* @param phpQueryObject $pqObj Elements matched by the rule's selector.
* @param mixed $ruleName Name (key) of the rule; passed on to the rule's callback.
* @param array $rule Parsed rule as produced by parseRule().
* @return mixed Extracted content, after the rule's callback (if callable) has been applied.
*/
protected function extractContent(phpQueryObject $pqObj, $ruleName, $rule)
{
switch ($rule['attr']) {
// 'text': inner HTML passed through allowTags(), which strips all tags except the listed ones.
case 'text':
$content = $this->allowTags($pqObj->html(), $rule['filter_tags']);
break;
// 'texts': same treatment applied through Elements::map(); the result is an array.
case 'texts':
$content = (new Elements($pqObj))->map(function (Elements $element) use ($rule) {
return $this->allowTags($element->html(), $rule['filter_tags']);
})->all();
break;
// 'html': inner HTML with the listed tags removed by stripTags().
case 'html':
$content = $this->stripTags($pqObj->html(), $rule['filter_tags']);
break;
// 'htmls': same treatment applied through Elements::map(); the result is an array.
case 'htmls':
$content = (new Elements($pqObj))->map(function (Elements $element) use ($rule) {
return $this->stripTags($element->html(), $rule['filter_tags']);
})->all();
break;
// 'htmlOuter': outer HTML with the listed tags removed by stripTags().
case 'htmlOuter':
$content = $this->stripTags($pqObj->htmlOuter(), $rule['filter_tags']);
break;
// 'htmlOuters': same treatment applied through Elements::map(); the result is an array.
case 'htmlOuters':
$content = (new Elements($pqObj))->map(function (Elements $element) use ($rule) {
return $this->stripTags($element->htmlOuter(), $rule['filter_tags']);
})->all();
break;
default:
// Anything else names an attribute: "attr(name)" reads it via attr(),
// "attrs(name)" reads it via Elements::attrs() and returns an array,
// and a bare value is used directly as the attribute name.
if(preg_match('/attr\((.+)\)/', $rule['attr'], $arr)) {
$content = $pqObj->attr($arr[1]);
} elseif (preg_match('/attrs\((.+)\)/', $rule['attr'], $arr)) {
$content = (new Elements($pqObj))->attrs($arr[1])->all();
} else {
$content = $pqObj->attr($rule['attr']);
}
break;
}
// Optional per-rule callback receives the extracted content and the rule name.
if (is_callable($rule['handle_callback'])) {
$content = call_user_func($rule['handle_callback'], $content, $ruleName);
}
return $content;
}
/**
 * Turn a positional rule definition into a named array.
 *
 * @param array $rule [selector, attr, filter tags (optional), callback (optional)]
 * @return array Keys: selector, attr, filter_tags (default ''), handle_callback (default null).
 */
protected function parseRule($rule)
{
    return [
        'selector' => $rule[0],
        'attr' => $rule[1],
        'filter_tags' => $rule[2] ?? '',
        'handle_callback' => $rule[3] ?? null,
    ];
}
/**
 * Remove specific HTML tags.
 *
 * Tag names prefixed with "-" are handed to removeTags(), which removes the
 * matching elements from a parsed copy of the markup. For the remaining names
 * only the opening and closing tags themselves are deleted; the text between
 * them stays.
 *
 * @param string $html
 * @param string $tags_str Tag names separated by whitespace.
 * @return string
 */
protected function stripTags($html, $tags_str)
{
    $tagsArr = $this->tag($tags_str);
    $html = $this->removeTags($html, $tagsArr[1]);

    $patterns = [];
    foreach ($tagsArr[0] as $tag) {
        // The lookahead requires the tag name to end right here, so stripping
        // "b" no longer also strips <br>, <body> or <button>. preg_quote()
        // keeps regex metacharacters in the name from altering the pattern.
        $patterns[] = '/<\/?' . preg_quote($tag, '/') . '(?=[\s\/>])[^>]*>/i';
    }

    return preg_replace($patterns, '', trim($html));
}
/**
 * Keep only specific HTML tags.
 *
 * Tag names prefixed with "-" are first handed to removeTags(); the remaining
 * names form the allow-list given to strip_tags().
 *
 * @param string $html
 * @param string $tags_str Tag names separated by whitespace.
 * @return string
 */
protected function allowTags($html, $tags_str)
{
    list($keep, $drop) = $this->tag($tags_str);
    $cleaned = $this->removeTags($html, $drop);

    // Build the "<a> <b> " style allow-list expected by strip_tags().
    $allowed = implode('', array_map(static function ($tag) {
        return "<$tag> ";
    }, $keep));

    return strip_tags(trim($cleaned), $allowed);
}
/**
 * Split a whitespace-separated tag list into two groups.
 *
 * @param string $tags_str Tag names separated by whitespace; a leading "-" marks a name for removal.
 * @return array Two lists: index 0 holds the plain names, index 1 the names
 *               that were prefixed with "-" (prefix stripped).
 */
protected function tag($tags_str)
{
    $tags = [[], []];

    foreach (preg_split("/\s+/", $tags_str, -1, PREG_SPLIT_NO_EMPTY) as $tag) {
        // Anchored on purpose: only a LEADING "-" marks removal. Unanchored,
        // a hyphenated name such as "my-widget" was misread as "remove widget".
        if (preg_match('/^-(.+)$/', $tag, $arr)) {
            $tags[1][] = $arr[1];
        } else {
            $tags[0][] = $tag;
        }
    }

    return $tags;
}
/**
 * Remove specific HTML elements from a piece of markup.
 *
 * The markup is parsed into a temporary phpQuery document, every element
 * matching one of the tag selectors is removed, and the document's outer HTML
 * is returned. With an empty tag list the markup is returned unchanged.
 *
 * @param string $html
 * @param array $tags Tag names (selectors) to remove.
 * @return string
 */
protected function removeTags($html, $tags)
{
    if (!count($tags)) {
        return $html;
    }

    // Comma-separated selector list matching any of the tags.
    $selector = implode(',', $tags);
    $doc = phpQuery::newDocumentHTML($html);
    try {
        pq($doc)->find($selector)->remove();
        $html = pq($doc)->htmlOuter();
    } finally {
        // Always unload the temporary document, even if the query throws.
        $doc->unloadDocument();
    }

    return $html;
}
/**
 * Unload the currently parsed document, if there is one.
 */
protected function destroyDocument()
{
    if (!($this->document instanceof phpQueryObject)) {
        return;
    }

    $this->document->unloadDocument();
}
public function __destruct()
{
// Unload the parsed document when this object is destroyed.
$this->destroyDocument();
}
}

View File

@@ -0,0 +1,15 @@
<?php
/**
* Created by PhpStorm.
* User: Jaeger <JaegerCode@gmail.com>
* Date: 2017/9/21
*/
namespace QL\Exceptions;
use Exception;
/**
* Exception type for a service name that has no binding.
*/
class ServiceNotFoundException extends Exception
{
}

74
src/Kernel.php Normal file
View File

@@ -0,0 +1,74 @@
<?php
/**
* Created by PhpStorm.
* User: Jaeger <JaegerCode@gmail.com>
* Date: 2017/9/21
*/
namespace QL;
use QL\Contracts\ServiceProviderContract;
use QL\Exceptions\ServiceNotFoundException;
use QL\Providers\EncodeServiceProvider;
use Closure;
use QL\Providers\HttpServiceProvider;
use QL\Providers\PluginServiceProvider;
use QL\Providers\SystemServiceProvider;
use Tightenco\Collect\Support\Collection;
class Kernel
{
    /**
     * Provider classes that bootstrap() instantiates and registers, in order.
     */
    protected $providers = [
        SystemServiceProvider::class,
        HttpServiceProvider::class,
        EncodeServiceProvider::class,
        PluginServiceProvider::class
    ];

    /** Collection of service name => Closure bindings. */
    protected $binds;

    /** The QueryList instance this kernel was created for. */
    protected $ql;

    /**
     * Kernel constructor.
     *
     * @param QueryList $ql
     */
    public function __construct(QueryList $ql)
    {
        $this->ql = $ql;
        $this->binds = new Collection();
    }

    /**
     * Register the built-in service providers.
     *
     * @return $this
     */
    public function bootstrap()
    {
        $this->registerProviders();

        return $this;
    }

    /**
     * Instantiate every configured provider class and register it.
     */
    public function registerProviders()
    {
        foreach ($this->providers as $providerClass) {
            $this->register(new $providerClass());
        }
    }

    /**
     * Bind a service closure under a name, replacing any previous binding.
     *
     * @param string $name
     * @param Closure $provider
     */
    public function bind(string $name, Closure $provider)
    {
        $this->binds->offsetSet($name, $provider);
    }

    /**
     * Look up the closure bound under a name.
     *
     * @param string $name
     * @return Closure
     * @throws ServiceNotFoundException When nothing is bound under $name.
     */
    public function getService(string $name)
    {
        if ($this->binds->offsetExists($name)) {
            return $this->binds->offsetGet($name);
        }

        throw new ServiceNotFoundException("Service: {$name} not found!");
    }

    /**
     * Let a provider add its bindings to this kernel.
     */
    private function register(ServiceProviderContract $instance)
    {
        $instance->register($this);
    }
}

View File

@@ -0,0 +1,22 @@
<?php
/**
* Created by PhpStorm.
* User: Jaeger <JaegerCode@gmail.com>
* Date: 2017/9/20
*/
namespace QL\Providers;
use QL\Contracts\ServiceProviderContract;
use QL\Kernel;
use QL\Services\EncodeService;
class EncodeServiceProvider implements ServiceProviderContract
{
    /**
     * Bind the "encoding" service.
     *
     * The closure must stay non-static: it passes $this on to
     * EncodeService::convert(), which expects a QueryList.
     * NOTE(review): this relies on the caller rebinding $this to the QueryList
     * (for example via Closure::call()) - confirm against the dispatch code.
     *
     * @param Kernel $kernel
     */
    public function register(Kernel $kernel)
    {
        // The nullable type is spelled out: relying on "= null" to make a
        // typed parameter nullable is deprecated as of PHP 8.4.
        $kernel->bind('encoding', function (string $outputEncoding, ?string $inputEncoding = null) {
            return EncodeService::convert($this, $outputEncoding, $inputEncoding);
        });
    }
}

View File

@@ -0,0 +1,40 @@
<?php
/**
* Created by PhpStorm.
* User: Jaeger <JaegerCode@gmail.com>
* Date: 2017/9/22
*/
namespace QL\Providers;
use QL\Contracts\ServiceProviderContract;
use QL\Kernel;
use QL\Services\HttpService;
use QL\Services\MultiRequestService;
class HttpServiceProvider implements ServiceProviderContract
{
    /**
     * Bind the HTTP services: get, post, postJson, multiGet and multiPost.
     *
     * The closures must stay non-static because they pass $this on as the
     * QueryList argument.
     * NOTE(review): this relies on the caller rebinding $this to the QueryList
     * (for example via Closure::call()) - confirm against the dispatch code.
     *
     * @param Kernel $kernel
     */
    public function register(Kernel $kernel)
    {
        // Single-request services map one-to-one onto HttpService's static methods.
        foreach (['get', 'post', 'postJson'] as $verb) {
            $kernel->bind($verb, function (...$args) use ($verb) {
                return HttpService::$verb($this, ...$args);
            });
        }

        // Concurrent-request services: service name => method given to MultiRequestService.
        foreach (['multiGet' => 'get', 'multiPost' => 'post'] as $service => $verb) {
            $kernel->bind($service, function (...$args) use ($verb) {
                return new MultiRequestService($this, $verb, ...$args);
            });
        }
    }
}

View File

@@ -0,0 +1,23 @@
<?php
/**
* Created by PhpStorm.
* User: Jaeger <JaegerCode@gmail.com>
* Date: 2017/9/22
*/
namespace QL\Providers;
use QL\Contracts\ServiceProviderContract;
use QL\Kernel;
use QL\Services\PluginService;
class PluginServiceProvider implements ServiceProviderContract
{
    /**
     * Bind the "use" service, which installs plugins through PluginService.
     *
     * The closure must stay non-static because it passes $this on as the
     * QueryList argument.
     * NOTE(review): this relies on the caller rebinding $this to the QueryList
     * (for example via Closure::call()) - confirm against the dispatch code.
     *
     * @param Kernel $kernel
     */
    public function register(Kernel $kernel)
    {
        $installer = function ($plugins, ...$opt) {
            return PluginService::install($this, $plugins, ...$opt);
        };

        $kernel->bind('use', $installer);
    }
}

View File

@@ -0,0 +1,32 @@
<?php
/**
* Created by PhpStorm.
* User: Jaeger <JaegerCode@gmail.com>
* Date: 2017/9/22
*/
namespace QL\Providers;
use QL\Contracts\ServiceProviderContract;
use QL\Kernel;
use Closure;
class SystemServiceProvider implements ServiceProviderContract
{
    /**
     * Bind the core services: html, queryData and pipe.
     *
     * The closures must stay non-static because they call methods on $this
     * and return it.
     * NOTE(review): this relies on the caller rebinding $this to the QueryList
     * (for example via Closure::call()) - confirm against the dispatch code.
     *
     * @param Kernel $kernel
     */
    public function register(Kernel $kernel)
    {
        // html(...): forward the arguments to setHtml() and return $this for chaining.
        $kernel->bind('html', function (...$args) {
            $this->setHtml(...$args);
            return $this;
        });

        // queryData($callback): run the query and return the result as a plain array.
        // The nullable type is spelled out: relying on "= null" to make a
        // typed parameter nullable is deprecated as of PHP 8.4.
        $kernel->bind('queryData', function (?Closure $callback = null) {
            return $this->query()->getData($callback)->all();
        });

        // pipe($callback): hand $this to the callback and return whatever it returns.
        // Calling pipe() without a callback used to be a fatal error (null is
        // not callable); it is now a no-op that returns $this.
        $kernel->bind('pipe', function (?Closure $callback = null) {
            if ($callback === null) {
                return $this;
            }
            return $callback($this);
        });
    }
}

133
src/QueryList.php Normal file
View File

@@ -0,0 +1,133 @@
<?php
/**
* QueryList
*
* 一个基于phpQuery的通用列表采集类
*
* @author Jaeger
* @email JaegerCode@gmail.com
* @link https://github.com/jae-jae/QueryList
* @version 4.0.0
*
*/
namespace QL;
use phpQuery;
use QL\Dom\Query;
use Tightenco\Collect\Support\Collection;
use Closure;
use QL\Services\MultiRequestService;
/**
* Class QueryList
* @package QL
*
* @method string getHtml($rel = true)
* @method QueryList setHtml($html)
* @method QueryList html($html)
* @method Dom\Elements find($selector)
* @method QueryList rules(array $rules)
* @method QueryList range($range)
* @method QueryList removeHead()
* @method QueryList query(Closure $callback = null)
* @method Collection getData(Closure $callback = null)
* @method Array queryData(Closure $callback = null)
* @method QueryList setData(Collection $data)
* @method QueryList encoding(string $outputEncoding,string $inputEncoding = null)
* @method QueryList get($url,$args = null,$otherArgs = [])
* @method QueryList post($url,$args = null,$otherArgs = [])
* @method QueryList postJson($url,$args = null,$otherArgs = [])
* @method MultiRequestService multiGet($urls)
* @method MultiRequestService multiPost($urls)
* @method QueryList use($plugins,...$opt)
* @method QueryList pipe(Closure $callback = null)
*/
class QueryList
{
    /** Query object that handles DOM-level calls. */
    protected $query;

    /** Kernel holding the named service bindings. */
    protected $kernel;

    /** Shared instance handed out by getInstance(). */
    protected static $instance = null;

    /**
     * QueryList constructor.
     *
     * Creates the Query and the bootstrapped Kernel for this instance, then
     * lets the global Config bootstrap it.
     */
    public function __construct()
    {
        $this->query = new Query($this);
        $this->kernel = (new Kernel($this))->bootstrap();
        Config::getInstance()->bootstrap($this);
    }

    /**
     * Dispatch an undefined instance method.
     *
     * Methods that exist on the Query object are forwarded to it; any other
     * name is resolved as a kernel service and invoked with $this bound to
     * this QueryList.
     */
    public function __call($name, $arguments)
    {
        if (method_exists($this->query, $name)) {
            return $this->query->$name(...$arguments);
        }

        return $this->kernel->getService($name)->call($this, ...$arguments);
    }

    /**
     * Dispatch an undefined static method on a freshly created instance.
     */
    public static function __callStatic($name, $arguments)
    {
        $ql = new self();

        return $ql->$name(...$arguments);
    }

    public function __destruct()
    {
        $this->destruct();
    }

    /**
     * Get the shared QueryList instance, creating it on first use.
     *
     * @return QueryList
     */
    public static function getInstance()
    {
        if (!self::$instance) {
            self::$instance = new self();
        }

        return self::$instance;
    }

    /**
     * Get the Config instance.
     *
     * @return null|Config
     */
    public static function config()
    {
        return Config::getInstance();
    }

    /**
     * Release the Query and Kernel objects held by this instance.
     */
    public function destruct()
    {
        unset($this->query, $this->kernel);
    }

    /**
     * Drop every document registered with phpQuery.
     */
    public static function destructDocuments()
    {
        phpQuery::$documents = [];
    }

    /**
     * Bind a custom method to the QueryList object.
     *
     * @param string $name Name the method is invoked by.
     * @param Closure $provide Closure to call.
     * @return $this
     */
    public function bind(string $name, Closure $provide)
    {
        $this->kernel->bind($name, $provide);

        return $this;
    }
}

View File

@@ -0,0 +1,37 @@
<?php
/**
* Created by PhpStorm.
* User: Jaeger <JaegerCode@gmail.com>
* Date: 2017/9/20
* 编码转换服务
*/
namespace QL\Services;
use QL\QueryList;
class EncodeService
{
    /**
     * Convert the QueryList's HTML to another character encoding.
     *
     * When no input encoding is given it is detected with detect(). If the
     * encoding cannot be determined, or iconv() fails, the document is left
     * untouched instead of being replaced by a false value.
     *
     * @param QueryList $ql
     * @param string $outputEncoding Target encoding; "//IGNORE" is appended for iconv().
     * @param string|null $inputEncoding Source encoding, or null to auto-detect.
     * @return QueryList The same $ql instance.
     */
    public static function convert(QueryList $ql, string $outputEncoding, ?string $inputEncoding = null)
    {
        $html = $ql->getHtml();

        if (!$inputEncoding) {
            $inputEncoding = self::detect($html);
        }
        if (!$inputEncoding) {
            // Detection failed: there is no source encoding to convert from.
            return $ql;
        }

        $converted = iconv($inputEncoding, $outputEncoding . '//IGNORE', $html);
        if ($converted !== false) {
            $ql->setHtml($converted);
        }

        return $ql;
    }

    /**
     * Attempt to detect the encoding of a string.
     *
     * Candidates are tried in strict mode: ASCII, GB2312, GBK, UTF-8.
     * A result of "CP936" (any letter case) is reported as "GBK".
     *
     * @param string $string
     * @return string|false Encoding name, or false when none of the candidates match.
     */
    public static function detect($string)
    {
        $charset = mb_detect_encoding($string, ['ASCII', 'GB2312', 'GBK', 'UTF-8'], true);

        if ($charset !== false && strtolower($charset) === 'cp936') {
            return 'GBK';
        }

        return $charset;
    }
}

View File

@@ -0,0 +1,59 @@
<?php
/**
* Created by PhpStorm.
* User: Jaeger <JaegerCode@gmail.com>
* Date: 2017/9/22
*/
namespace QL\Services;
use GuzzleHttp\Cookie\CookieJar;
use Jaeger\GHttp;
use QL\QueryList;
class HttpService
{
    /** Cookie jar shared by all requests made through this service. */
    protected static $cookieJar = null;

    /**
     * Get the shared cookie jar, creating it on first use.
     *
     * @return CookieJar
     */
    public static function getCookieJar()
    {
        if (self::$cookieJar != null) {
            return self::$cookieJar;
        }

        self::$cookieJar = new CookieJar();

        return self::$cookieJar;
    }

    /**
     * Fetch a URL with GHttp::get() and load the response into $ql.
     */
    public static function get(QueryList $ql, $url, $args = null, $otherArgs = [])
    {
        return self::request('get', $ql, $url, $args, $otherArgs);
    }

    /**
     * Fetch a URL with GHttp::post() and load the response into $ql.
     */
    public static function post(QueryList $ql, $url, $args = null, $otherArgs = [])
    {
        return self::request('post', $ql, $url, $args, $otherArgs);
    }

    /**
     * Fetch a URL with GHttp::postJson() and load the response into $ql.
     */
    public static function postJson(QueryList $ql, $url, $args = null, $otherArgs = [])
    {
        return self::request('postJson', $ql, $url, $args, $otherArgs);
    }

    /**
     * Shared implementation: merge the default options, call the named GHttp
     * method and pass its result to $ql->setHtml().
     */
    private static function request($method, QueryList $ql, $url, $args, $otherArgs)
    {
        // Caller-supplied options override these defaults.
        // NOTE(review): 'verify' => false is passed by default; if GHttp hands
        // it to Guzzle, TLS certificate verification is off unless overridden.
        $options = array_merge([
            'cookies' => self::getCookieJar(),
            'verify' => false
        ], $otherArgs);

        $html = GHttp::$method($url, $args, $options);
        $ql->setHtml($html);

        return $ql;
    }
}

View File

@@ -0,0 +1,66 @@
<?php
/**
* Created by PhpStorm.
* User: Jaeger <JaegerCode@gmail.com>
* Date: 18/12/10
* Time: 下午7:05
*/
namespace QL\Services;
use Jaeger\GHttp;
use Closure;
use GuzzleHttp\Psr7\Response;
use QL\QueryList;
use GuzzleHttp\Exception\RequestException;
/**
* Class MultiRequestService
* @package QL\Services
*
* @method MultiRequestService withHeaders($headers)
* @method MultiRequestService withOptions($options)
* @method MultiRequestService concurrency($concurrency)
*/
class MultiRequestService
{
    /** QueryList instance handed to the success/error callbacks. */
    protected $ql;

    /** Request object obtained from GHttp::multiRequest(). */
    protected $multiRequest;

    /** Name of the method invoked on the request object by send(). */
    protected $method;

    public function __construct(QueryList $ql, $method, $urls)
    {
        $this->ql = $ql;
        $this->method = $method;
        $this->multiRequest = GHttp::multiRequest($urls);
    }

    /**
     * Forward any other call to the underlying request object, keep the
     * object it returns, and return $this for chaining.
     */
    public function __call($name, $arguments)
    {
        $this->multiRequest = $this->multiRequest->$name(...$arguments);

        return $this;
    }

    /**
     * Register a success callback.
     *
     * Before the callback runs, the response body is loaded into the
     * QueryList; the callback receives ($ql, $response, $index).
     */
    public function success(Closure $success)
    {
        $onSuccess = function (Response $response, $index) use ($success) {
            $body = (String)$response->getBody();
            $this->ql->setHtml($body);
            $success($this->ql, $response, $index);
        };
        $this->multiRequest = $this->multiRequest->success($onSuccess);

        return $this;
    }

    /**
     * Register an error callback; it receives ($ql, $reason, $index).
     */
    public function error(Closure $error)
    {
        $onError = function (RequestException $reason, $index) use ($error) {
            $error($this->ql, $reason, $index);
        };
        $this->multiRequest = $this->multiRequest->error($onError);

        return $this;
    }

    /**
     * Invoke the configured method on the underlying request object.
     */
    public function send()
    {
        $verb = $this->method;
        $this->multiRequest->$verb();
    }
}

View File

@@ -0,0 +1,26 @@
<?php
/**
* Created by PhpStorm.
* User: Jaeger <JaegerCode@gmail.com>
* Date: 2017/9/22
*/
namespace QL\Services;
use QL\QueryList;
class PluginService
{
    /**
     * Install one plugin or a list of plugins into a QueryList.
     *
     * A single plugin receives the extra options; plugins passed as an array
     * are each installed without options.
     *
     * @param QueryList $queryList
     * @param mixed $plugins Plugin class (or object), or an array of them.
     * @param mixed ...$opt Options forwarded to a single plugin's install().
     * @return QueryList The same $queryList instance.
     */
    public static function install(QueryList $queryList, $plugins, ...$opt)
    {
        if (!is_array($plugins)) {
            $plugins::install($queryList, ...$opt);

            return $queryList;
        }

        foreach ($plugins as $plugin) {
            $plugin::install($queryList);
        }

        return $queryList;
    }
}

71
tests/Dom/FindTest.php Normal file
View File

@@ -0,0 +1,71 @@
<?php
/**
* Created by PhpStorm.
* User: x
* Date: 2018/12/10
* Time: 12:46 AM
*/
namespace Tests\Dom;
use QL\QueryList;
use Tests\TestCaseBase;
class FindTest extends TestCaseBase
{
    protected $html;
    protected $ql;

    protected function setUp(): void
    {
        $this->html = $this->getSnippet('snippet-1');
        $this->ql = QueryList::html($this->html);
    }

    /**
     * Every way of addressing the first image must yield the same src.
     *
     * @test
     */
    public function find_first_dom_attr()
    {
        $img = [];
        $img[] = $this->ql->find('img')->attr('src');
        $img[] = $this->ql->find('img')->src;
        $img[] = $this->ql->find('img:eq(0)')->src;
        $img[] = $this->ql->find('img')->eq(0)->src;
        $alt = $this->ql->find('img')->alt;
        $abc = $this->ql->find('img')->abc;

        $this->assertCount(1, array_unique($img));
        // assertEquals() takes (expected, actual); the arguments were reversed,
        // which makes failure messages misleading.
        $this->assertEquals('这是图片', $alt);
        $this->assertEquals('这是一个自定义属性', $abc);
    }

    /**
     * Every way of addressing the second image must yield the same alt.
     *
     * @test
     */
    public function find_second_dom_attr()
    {
        $img2 = [];
        $img2[] = $this->ql->find('img')->eq(1)->alt;
        $img2[] = $this->ql->find('img:eq(1)')->alt;
        $img2[] = $this->ql->find('.second_pic')->alt;

        $this->assertCount(1, array_unique($img2));
    }

    /**
     * attr('*') must return all attributes of the element.
     *
     * @test
     */
    public function find_dom_all_attr()
    {
        $imgAttr = $this->ql->find('img:eq(0)')->attr('*');
        $linkAttr = $this->ql->find('a:eq(1)')->attr('*');

        $this->assertCount(3, $imgAttr);
        $this->assertCount(1, $linkAttr);
    }
}

43
tests/Dom/RulesTest.php Normal file
View File

@@ -0,0 +1,43 @@
<?php
/**
* Created by PhpStorm.
* User: Jaeger <JaegerCode@gmail.com>
* Date: 18/12/12
* Time: 下午12:25
*/
namespace Tests\Dom;
use QL\QueryList;
use Tests\TestCaseBase;
use Tightenco\Collect\Support\Collection;
class RulesTest extends TestCaseBase
{
    protected $html;
    protected $ql;

    protected function setUp(): void
    {
        $this->html = $this->getSnippet('snippet-2');
        $this->ql = QueryList::html($this->html);
    }

    /**
     * Rules applied inside a range must yield one row per list item.
     *
     * @test
     */
    public function get_data_by_rules()
    {
        $data = QueryList::rules([
            'a' => ['a','text'],
            'img_src' => ['img','src'],
            'img_alt' => ['img','alt']
        ])
            ->range('ul>li')
            ->html($this->html)
            ->query()
            ->getData();

        $this->assertInstanceOf(Collection::class, $data);
        $this->assertCount(3, $data);
        $this->assertEquals('http://querylist.com/2.jpg', $data[1]['img_src']);
    }
}

103
tests/Feature/HttpTest.php Normal file
View File

@@ -0,0 +1,103 @@
<?php
/**
* Created by PhpStorm.
* User: x
* Date: 2018/12/10
* Time: 12:35 AM
*/
namespace Tests\Feature;
use GuzzleHttp\Handler\MockHandler;
use GuzzleHttp\Psr7\Response;
use QL\QueryList;
use Tests\TestCaseBase;
class HttpTest extends TestCaseBase
{
    protected $urls;

    protected function setUp(): void
    {
        $this->urls = [
            'http://httpbin.org/get?name=php',
            'http://httpbin.org/get?name=golang',
            'http://httpbin.org/get?name=c++',
            'http://httpbin.org/get?name=java'
        ];
    }

    /**
     * postJson() must send the data as a JSON-encoded request body.
     *
     * @test
     */
    public function can_post_json_data()
    {
        $mock = new MockHandler([new Response()]);
        $data = [
            'name' => 'foo'
        ];
        QueryList::postJson('http://foo.com', $data, [
            'handler' => $mock
        ]);

        // assertEquals() takes (expected, actual); the arguments were reversed.
        $this->assertEquals(json_encode($data), (string)$mock->getLastRequest()->getBody());
    }

    /**
     * Each concurrent response must correspond to the URL at the same index.
     *
     * @test
     */
    public function concurrent_requests_base_use()
    {
        $urls = $this->urls;
        QueryList::getInstance()
            ->multiGet($urls)
            ->success(function (QueryList $ql, Response $response, $index) use ($urls) {
                $body = json_decode((string)$response->getBody(), true);
                $this->assertEquals($urls[$index], $body['url']);
            })->send();
    }

    /**
     * Options and headers must be applied, and only the unreachable URL may fail.
     *
     * @test
     */
    public function concurrent_requests_advanced_use()
    {
        $ua = 'QueryList/4.0';
        $errorUrl = 'http://web-site-not-exist.com';
        $urls = array_merge($this->urls, [$errorUrl]);
        QueryList::rules([])
            ->multiGet($urls)
            ->concurrency(2)
            ->withOptions([
                'timeout' => 60
            ])
            ->withHeaders([
                'User-Agent' => $ua
            ])
            ->success(function (QueryList $ql, Response $response, $index) use ($ua) {
                $body = json_decode((string)$response->getBody(), true);
                $this->assertEquals($ua, $body['headers']['User-Agent']);
            })
            ->error(function (QueryList $ql, $reason, $index) use ($urls, $errorUrl) {
                // Expected value first: the failing request must be the bad URL.
                $this->assertEquals($errorUrl, $urls[$index]);
            })
            ->send();
    }

    /**
     * A request made with cache options must still return the response body.
     *
     * @test
     */
    public function request_with_cache()
    {
        $url = $this->urls[0];
        $data = QueryList::get($url, null, [
            'cache' => sys_get_temp_dir(),
            'cache_ttl' => 600
        ])->getHtml();
        $data = json_decode($data, true);

        $this->assertEquals($url, $data['url']);
    }
}

View File

@@ -0,0 +1,48 @@
<?php
/**
* Created by PhpStorm.
* User: x
* Date: 2018/12/9
* Time: 11:10 PM
*/
namespace Tests\Feature;
use QL\QueryList;
use Tests\TestCaseBase;
class InstanceTest extends TestCaseBase
{
    protected $html;

    protected function setUp(): void
    {
        $this->html = $this->getSnippet('snippet-1');
    }

    /**
     * getInstance() must hand out an object that shares the loaded HTML.
     *
     * @test
     */
    public function singleton_instance_mode()
    {
        $first = QueryList::getInstance()->html($this->html);
        $second = QueryList::getInstance();

        $this->assertEquals($first->getHtml(), $second->getHtml());
    }

    /**
     * Objects created with "new" or through a static call must not share HTML.
     *
     * @test
     */
    public function get_new_object()
    {
        // Explicitly constructed objects.
        $loaded = (new QueryList())->html($this->html);
        $blank = (new QueryList())->html('');
        $this->assertNotEquals($loaded->getHtml(), $blank->getHtml());

        // Objects created implicitly by a static call.
        $loaded = QueryList::range('')->html($this->html);
        $blank = QueryList::range('')->html('');
        $this->assertNotEquals($loaded->getHtml(), $blank->getHtml());
    }
}

View File

@@ -0,0 +1,36 @@
<?php
/**
* Created by PhpStorm.
* User: x
* Date: 2018/12/10
* Time: 1:14 AM
*/
namespace Tests\Feature;
use QL\QueryList;
use Tests\TestCaseBase;
class MethodTest extends TestCaseBase
{
    protected $html;

    protected function setUp(): void
    {
        $this->html = $this->getSnippet('snippet-1');
    }

    /**
     * pipe() must pass the QueryList to the callback and return its result.
     *
     * @test
     */
    public function pipe()
    {
        $html = $this->html;
        $loader = function (QueryList $ql) use ($html) {
            $ql->setHtml($html);
            return $ql;
        };

        $qlHtml = QueryList::pipe($loader)->getHtml(false);

        $this->assertEquals($html, $qlHtml);
    }
}

20
tests/TestCaseBase.php Normal file
View File

@@ -0,0 +1,20 @@
<?php
/**
* Created by PhpStorm.
* User: x
* Date: 2018/12/9
* Time: 11:43 PM
*/
namespace Tests;
use PHPUnit\Framework\TestCase;
class TestCaseBase extends TestCase
{
    /**
     * Read an HTML fixture from the tests/assets directory.
     *
     * @param string $name Fixture file name without the ".html" extension.
     * @return string|false File contents, or false when the file cannot be read.
     */
    public function getSnippet($name)
    {
        $path = __DIR__.'/assets/'.$name.'.html';

        return file_get_contents($path);
    }
}

View File

@@ -0,0 +1,9 @@
<div id="one">
<div class="two">
<a href="http://querylist.cc">QueryList官网</a>
<img src="http://querylist.com/1.jpg" alt="这是图片" abc="这是一个自定义属性">
<img class="second_pic" src="http://querylist.com/2.jpg" alt="这是图片2">
<a href="http://doc.querylist.cc">QueryList文档</a>
</div>
<span>其它的<b>一些</b>文本</span>
</div>

View File

@@ -0,0 +1,16 @@
<div id="one">
<ul>
<li>
<a href="http://querylist.cc">QueryList官网</a>
<img src="http://querylist.com/1.jpg" alt="这是图片1" abc="这是一个自定义属性1">
</li>
<li>
<a href="http://v3.querylist.cc">QueryList V3文档</a>
<img src="http://querylist.com/2.jpg" alt="这是图片2" abc="这是一个自定义属性2">
</li>
<li>
<a href="http://v4.querylist.cc">QueryList V4文档</a>
<img src="http://querylist.com/3.jpg" alt="这是图片3" abc="这是一个自定义属性3">
</li>
</ul>
</div>

5
tests/bootstrap.php Normal file
View File

@@ -0,0 +1,5 @@
<?php
// Remove the script execution time limit.
set_time_limit(0);
// Load the autoloader from the vendor directory (presumably Composer's).
require __DIR__.'/../vendor/autoload.php';