Compare commits

..

16 Commits

Author SHA1 Message Date
Jaeger(黄杰)
894fb4344e
Merge pull request #145 from maxiaozhi/master
正则匹配成功时才替换掉html
2021-08-08 13:04:55 +08:00
lion
e4fc716acd 正则匹配成功时才替换掉html 2021-07-18 23:37:35 +08:00
Jaeger(黄杰)
39dc0ca9c6
Merge pull request #143 from maxiaozhi/patch-1
Fix the matching exception
2021-07-05 14:07:58 +08:00
maxiaozhi
ef0a2efd4f
Fix the matching exception
Fix the matching exception when the page contains multiple tags prefixed with head (for example: < head >, < header >)
2021-07-05 13:51:24 +08:00
huangjie
5953daac54 update collect 2020-12-14 10:39:28 +08:00
huangjie
465c6aefc7 update collect 2020-09-27 17:41:44 +08:00
Jaeger(黄杰)
92cb319d44
Update README-ZH.md 2020-07-18 13:06:29 +08:00
Jaeger(黄杰)
cbf3e0fcad
Update README.md 2020-07-18 13:05:59 +08:00
Jaeger(黄杰)
cfa2d94a79
Update FUNDING.yml 2020-07-17 13:20:49 +08:00
Jaeger(黄杰)
47a444bf9e
Create FUNDING.yml 2020-07-17 13:08:44 +08:00
Jaeger
85903fa9b5 feat: rules add attrs 2020-04-03 20:16:00 +08:00
Jaeger(黄杰)
e527c637c7
Merge pull request #110 from jae-jae/develop
replace collect()
2020-04-03 04:55:27 -05:00
Jaeger
f0a9798925 replace collect() 2020-04-03 17:33:32 +08:00
Jaeger
faea883c6f fix: data callback 2020-04-01 22:03:50 +08:00
Jaeger
c16826a573 updaed composer dependency 2020-03-23 18:15:04 +08:00
Jaeger
1492751f98 feat: optimization getHtml() 2020-03-22 17:19:57 +08:00
15 changed files with 100 additions and 64 deletions

12
.github/FUNDING.yml vendored Normal file
View File

@ -0,0 +1,12 @@
# These are supported funding model platforms
github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
patreon: # Replace with a single Patreon username
open_collective: querylist # Replace with a single Open Collective username
ko_fi: # Replace with a single Ko-fi username
tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
liberapay: # Replace with a single Liberapay username
issuehunt: # Replace with a single IssueHunt username
otechie: # Replace with a single Otechie username
custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']

3
.gitignore vendored
View File

@ -1,4 +1,5 @@
/vendor/ /vendor/
.idea/ .idea/
composer.lock composer.lock
.DS_Store .DS_Store
*.cache

View File

@ -29,7 +29,7 @@
- ..... - .....
## 环境要求 ## 环境要求
- PHP >= 7.0 - PHP >= 7.1
> 如果你的PHP版本还停留在PHP5或者不会使用Composer,你可以选择使用QueryList3,QueryList3支持php5.3以及手动安装。 > 如果你的PHP版本还停留在PHP5或者不会使用Composer,你可以选择使用QueryList3,QueryList3支持php5.3以及手动安装。
QueryList3 文档:http://v3.querylist.cc QueryList3 文档:http://v3.querylist.cc
@ -306,4 +306,4 @@ $ql->curlMulti([
Jaeger <JaegerCode@gmail.com> Jaeger <JaegerCode@gmail.com>
## Lisence ## Lisence
QueryList is licensed under the license of MIT. See the LICENSE for more details. QueryList is licensed under the license of MIT. See the LICENSE for more details.

View File

@ -31,7 +31,7 @@ Through plug-ins you can easily implement things like:
- ..... - .....
## Requirements ## Requirements
- PHP >= 7.0 - PHP >= 7.1
## Installation ## Installation
By Composer installation: By Composer installation:
@ -301,4 +301,4 @@ Jaeger <JaegerCode@gmail.com>
If this library is useful for you, say thanks [buying me a beer :beer:](https://www.paypal.me/jaepay)! If this library is useful for you, say thanks [buying me a beer :beer:](https://www.paypal.me/jaepay)!
## Lisence ## Lisence
QueryList is licensed under the license of MIT. See the LICENSE for more details. QueryList is licensed under the license of MIT. See the LICENSE for more details.

View File

@ -4,11 +4,11 @@
"keywords":["QueryList","phpQuery","spider"], "keywords":["QueryList","phpQuery","spider"],
"homepage": "http://querylist.cc", "homepage": "http://querylist.cc",
"require": { "require": {
"PHP":">=7.0", "PHP":">=7.1",
"jaeger/phpquery-single": "^1", "jaeger/phpquery-single": "^1",
"tightenco/collect": "^5",
"jaeger/g-http": "^1.1", "jaeger/g-http": "^1.1",
"ext-dom": "*" "ext-dom": "*",
"tightenco/collect": ">5.0"
}, },
"suggest":{ "suggest":{
@ -32,6 +32,9 @@
}, },
"require-dev": { "require-dev": {
"symfony/var-dumper": "^3.3", "symfony/var-dumper": "^3.3",
"phpunit/phpunit": "^7.5" "phpunit/phpunit": "^8.5"
},
"scripts": {
"test": "./vendor/bin/phpunit"
} }
} }

View File

@ -7,6 +7,7 @@
namespace QL; namespace QL;
use Closure; use Closure;
use Tightenco\Collect\Support\Collection;
class Config class Config
{ {
@ -20,8 +21,8 @@ class Config
*/ */
public function __construct() public function __construct()
{ {
$this->plugins = collect(); $this->plugins = new Collection();
$this->binds = collect(); $this->binds = new Collection();
} }

View File

@ -9,6 +9,7 @@ namespace QL\Dom;
use phpDocumentor\Reflection\Types\Null_; use phpDocumentor\Reflection\Types\Null_;
use phpQueryObject; use phpQueryObject;
use Tightenco\Collect\Support\Collection;
/** /**
* Class Elements * Class Elements
@ -191,7 +192,7 @@ class Elements
*/ */
public function map($callback) public function map($callback)
{ {
$collection = collect(); $collection = new Collection();
$this->elements->each(function ($dom) use (& $collection, $callback) { $this->elements->each(function ($dom) use (& $collection, $callback) {
$collection->push($callback(new self(pq($dom)))); $collection->push($callback(new self(pq($dom))));
}); });

View File

@ -35,11 +35,12 @@ class Query
} }
/** /**
* @return mixed * @param bool $rel
* @return String
*/ */
public function getHtml() public function getHtml($rel = true)
{ {
return $this->html; return $rel ? $this->document->htmlOuter() : $this->html;
} }
/** /**
@ -51,7 +52,7 @@ class Query
{ {
$this->html = value($html); $this->html = value($html);
$this->destroyDocument(); $this->destroyDocument();
$this->document = phpQuery::newDocumentHTML($this->html,$charset); $this->document = phpQuery::newDocumentHTML($this->html, $charset);
return $this->ql; return $this->ql;
} }
@ -63,7 +64,7 @@ class Query
*/ */
public function getData(Closure $callback = null) public function getData(Closure $callback = null)
{ {
return is_null($callback) ? $this->data : $this->data->map($callback); return $this->handleData($this->data, $callback);
} }
/** /**
@ -124,8 +125,8 @@ class Query
*/ */
public function removeHead() public function removeHead()
{ {
$html = preg_replace('/<head.+?>.+<\/head>/is','<head></head>',$this->html); $html = preg_replace('/(<head>|<head\s+.+?>).+?<\/head>/is', '<head></head>', $this->html);
$this->setHtml($html); $html && $this->setHtml($html);
return $this->ql; return $this->ql;
} }
@ -138,24 +139,37 @@ class Query
public function query(Closure $callback = null) public function query(Closure $callback = null)
{ {
$this->data = $this->getList(); $this->data = $this->getList();
$callback && $this->data = $this->data->map($callback); $this->data = $this->handleData($this->data, $callback);
return $this->ql; return $this->ql;
} }
public function handleData(Collection $data, $callback)
{
if (is_callable($callback)) {
if (empty($this->range)) {
$data = new Collection($callback($data->all(), null));
} else {
$data = $data->map($callback);
}
}
return $data;
}
protected function getList() protected function getList()
{ {
$data = []; $data = [];
if (empty($this->range)) { if (empty($this->range)) {
foreach ($this->rules as $key => $reg_value){ foreach ($this->rules as $key => $reg_value) {
$rule = $this->parseRule($reg_value); $rule = $this->parseRule($reg_value);
$contentElements = $this->document->find($rule['selector']); $contentElements = $this->document->find($rule['selector']);
$data[$key] = $this->extractContent($contentElements, $key, $rule); $data[$key] = $this->extractContent($contentElements, $key, $rule);
} }
} else { } else {
$rangeElements = $this->document->find($this->range); $rangeElements = $this->document->find($this->range);
$i = 0; $i = 0;
foreach ($rangeElements as $element) { foreach ($rangeElements as $element) {
foreach ($this->rules as $key => $reg_value){ foreach ($this->rules as $key => $reg_value) {
$rule = $this->parseRule($reg_value); $rule = $this->parseRule($reg_value);
$contentElements = pq($element)->find($rule['selector']); $contentElements = pq($element)->find($rule['selector']);
$data[$i][$key] = $this->extractContent($contentElements, $key, $rule); $data[$i][$key] = $this->extractContent($contentElements, $key, $rule);
@ -164,7 +178,7 @@ class Query
} }
} }
return collect($data); return new Collection($data);
} }
protected function extractContent(phpQueryObject $pqObj, $ruleName, $rule) protected function extractContent(phpQueryObject $pqObj, $ruleName, $rule)
@ -174,7 +188,7 @@ class Query
$content = $this->allowTags($pqObj->html(), $rule['filter_tags']); $content = $this->allowTags($pqObj->html(), $rule['filter_tags']);
break; break;
case 'texts': case 'texts':
$content = (new Elements($pqObj))->map(function(Elements $element) use($rule){ $content = (new Elements($pqObj))->map(function (Elements $element) use ($rule) {
return $this->allowTags($element->html(), $rule['filter_tags']); return $this->allowTags($element->html(), $rule['filter_tags']);
})->all(); })->all();
break; break;
@ -182,7 +196,7 @@ class Query
$content = $this->stripTags($pqObj->html(), $rule['filter_tags']); $content = $this->stripTags($pqObj->html(), $rule['filter_tags']);
break; break;
case 'htmls': case 'htmls':
$content = (new Elements($pqObj))->map(function(Elements $element) use($rule){ $content = (new Elements($pqObj))->map(function (Elements $element) use ($rule) {
return $this->stripTags($element->html(), $rule['filter_tags']); return $this->stripTags($element->html(), $rule['filter_tags']);
})->all(); })->all();
break; break;
@ -190,16 +204,22 @@ class Query
$content = $this->stripTags($pqObj->htmlOuter(), $rule['filter_tags']); $content = $this->stripTags($pqObj->htmlOuter(), $rule['filter_tags']);
break; break;
case 'htmlOuters': case 'htmlOuters':
$content = (new Elements($pqObj))->map(function(Elements $element) use($rule){ $content = (new Elements($pqObj))->map(function (Elements $element) use ($rule) {
return $this->stripTags($element->htmlOuter(), $rule['filter_tags']); return $this->stripTags($element->htmlOuter(), $rule['filter_tags']);
})->all(); })->all();
break; break;
default: default:
$content = $pqObj->attr($rule['attr']); if(preg_match('/attr\((.+)\)/', $rule['attr'], $arr)) {
$content = $pqObj->attr($arr[1]);
} elseif (preg_match('/attrs\((.+)\)/', $rule['attr'], $arr)) {
$content = (new Elements($pqObj))->attrs($arr[1])->all();
} else {
$content = $pqObj->attr($rule['attr']);
}
break; break;
} }
if(is_callable($rule['handle_callback'])){ if (is_callable($rule['handle_callback'])) {
$content = call_user_func($rule['handle_callback'], $content, $ruleName); $content = call_user_func($rule['handle_callback'], $content, $ruleName);
} }
@ -219,49 +239,47 @@ class Query
/** /**
* 去除特定的html标签 * 去除特定的html标签
* @param string $html * @param string $html
* @param string $tags_str 多个标签名之间用空格隔开 * @param string $tags_str 多个标签名之间用空格隔开
* @return string * @return string
*/ */
protected function stripTags($html,$tags_str) protected function stripTags($html, $tags_str)
{ {
$tagsArr = $this->tag($tags_str); $tagsArr = $this->tag($tags_str);
$html = $this->removeTags($html,$tagsArr[1]); $html = $this->removeTags($html, $tagsArr[1]);
$p = array(); $p = array();
foreach ($tagsArr[0] as $tag) { foreach ($tagsArr[0] as $tag) {
$p[]="/(<(?:\/".$tag."|".$tag.")[^>]*>)/i"; $p[] = "/(<(?:\/" . $tag . "|" . $tag . ")[^>]*>)/i";
} }
$html = preg_replace($p,"",trim($html)); $html = preg_replace($p, "", trim($html));
return $html; return $html;
} }
/** /**
* 保留特定的html标签 * 保留特定的html标签
* @param string $html * @param string $html
* @param string $tags_str 多个标签名之间用空格隔开 * @param string $tags_str 多个标签名之间用空格隔开
* @return string * @return string
*/ */
protected function allowTags($html,$tags_str) protected function allowTags($html, $tags_str)
{ {
$tagsArr = $this->tag($tags_str); $tagsArr = $this->tag($tags_str);
$html = $this->removeTags($html,$tagsArr[1]); $html = $this->removeTags($html, $tagsArr[1]);
$allow = ''; $allow = '';
foreach ($tagsArr[0] as $tag) { foreach ($tagsArr[0] as $tag) {
$allow .= "<$tag> "; $allow .= "<$tag> ";
} }
return strip_tags(trim($html),$allow); return strip_tags(trim($html), $allow);
} }
protected function tag($tags_str) protected function tag($tags_str)
{ {
$tagArr = preg_split("/\s+/",$tags_str,-1,PREG_SPLIT_NO_EMPTY); $tagArr = preg_split("/\s+/", $tags_str, -1, PREG_SPLIT_NO_EMPTY);
$tags = array(array(),array()); $tags = array(array(), array());
foreach($tagArr as $tag) foreach ($tagArr as $tag) {
{ if (preg_match('/-(.+)/', $tag, $arr)) {
if(preg_match('/-(.+)/', $tag,$arr))
{
array_push($tags[1], $arr[1]); array_push($tags[1], $arr[1]);
}else{ } else {
array_push($tags[0], $tag); array_push($tags[0], $tag);
} }
} }
@ -270,17 +288,16 @@ class Query
/** /**
* 移除特定的html标签 * 移除特定的html标签
* @param string $html * @param string $html
* @param array $tags 标签数组 * @param array $tags 标签数组
* @return string * @return string
*/ */
protected function removeTags($html,$tags) protected function removeTags($html, $tags)
{ {
$tag_str = ''; $tag_str = '';
if(count($tags)) if (count($tags)) {
{
foreach ($tags as $tag) { foreach ($tags as $tag) {
$tag_str .= $tag_str?','.$tag:$tag; $tag_str .= $tag_str ? ',' . $tag : $tag;
} }
// phpQuery::$defaultCharset = $this->inputEncoding?$this->inputEncoding:$this->htmlEncoding; // phpQuery::$defaultCharset = $this->inputEncoding?$this->inputEncoding:$this->htmlEncoding;
$doc = phpQuery::newDocumentHTML($html); $doc = phpQuery::newDocumentHTML($html);
@ -293,7 +310,7 @@ class Query
protected function destroyDocument() protected function destroyDocument()
{ {
if($this->document instanceof phpQueryObject) { if ($this->document instanceof phpQueryObject) {
$this->document->unloadDocument(); $this->document->unloadDocument();
} }
} }

View File

@ -14,6 +14,7 @@ use Closure;
use QL\Providers\HttpServiceProvider; use QL\Providers\HttpServiceProvider;
use QL\Providers\PluginServiceProvider; use QL\Providers\PluginServiceProvider;
use QL\Providers\SystemServiceProvider; use QL\Providers\SystemServiceProvider;
use Tightenco\Collect\Support\Collection;
class Kernel class Kernel
{ {
@ -34,7 +35,7 @@ class Kernel
public function __construct(QueryList $ql) public function __construct(QueryList $ql)
{ {
$this->ql = $ql; $this->ql = $ql;
$this->binds = collect(); $this->binds = new Collection();
} }
public function bootstrap() public function bootstrap()

View File

@ -23,7 +23,7 @@ use QL\Services\MultiRequestService;
* Class QueryList * Class QueryList
* @package QL * @package QL
* *
* @method string getHtml() * @method string getHtml($rel = true)
* @method QueryList setHtml($html) * @method QueryList setHtml($html)
* @method QueryList html($html) * @method QueryList html($html)
* @method Dom\Elements find($selector) * @method Dom\Elements find($selector)

View File

@ -17,7 +17,7 @@ class FindTest extends TestCaseBase
protected $html; protected $html;
protected $ql; protected $ql;
public function setUp() protected function setUp(): void
{ {
$this->html = $this->getSnippet('snippet-1'); $this->html = $this->getSnippet('snippet-1');
$this->ql = QueryList::html($this->html); $this->ql = QueryList::html($this->html);

View File

@ -18,7 +18,7 @@ class RulesTest extends TestCaseBase
protected $html; protected $html;
protected $ql; protected $ql;
public function setUp() protected function setUp(): void
{ {
$this->html = $this->getSnippet('snippet-2'); $this->html = $this->getSnippet('snippet-2');
$this->ql = QueryList::html($this->html); $this->ql = QueryList::html($this->html);

View File

@ -18,7 +18,7 @@ class HttpTest extends TestCaseBase
{ {
protected $urls; protected $urls;
public function setUp() protected function setUp(): void
{ {
$this->urls = [ $this->urls = [
'http://httpbin.org/get?name=php', 'http://httpbin.org/get?name=php',

View File

@ -16,7 +16,7 @@ class InstanceTest extends TestCaseBase
{ {
protected $html; protected $html;
public function setUp() protected function setUp(): void
{ {
$this->html = $this->getSnippet('snippet-1'); $this->html = $this->getSnippet('snippet-1');
} }
@ -38,11 +38,11 @@ class InstanceTest extends TestCaseBase
public function get_new_object() public function get_new_object()
{ {
$ql = (new QueryList())->html($this->html); $ql = (new QueryList())->html($this->html);
$ql2 = new QueryList(); $ql2 = (new QueryList())->html('');
$this->assertNotEquals($ql->getHtml(),$ql2->getHtml()); $this->assertNotEquals($ql->getHtml(),$ql2->getHtml());
$ql = QueryList::range('')->html($this->html); $ql = QueryList::range('')->html($this->html);
$ql2 = QueryList::range(''); $ql2 = QueryList::range('')->html('');
$this->assertNotEquals($ql->getHtml(),$ql2->getHtml()); $this->assertNotEquals($ql->getHtml(),$ql2->getHtml());
} }
} }

View File

@ -16,7 +16,7 @@ class MethodTest extends TestCaseBase
{ {
protected $html; protected $html;
public function setUp() protected function setUp(): void
{ {
$this->html = $this->getSnippet('snippet-1'); $this->html = $this->getSnippet('snippet-1');
} }
@ -30,7 +30,7 @@ class MethodTest extends TestCaseBase
$qlHtml = QueryList::pipe(function(QueryList $ql) use($html){ $qlHtml = QueryList::pipe(function(QueryList $ql) use($html){
$ql->setHtml($html); $ql->setHtml($html);
return $ql; return $ql;
})->getHtml(); })->getHtml(false);
$this->assertEquals($html,$qlHtml); $this->assertEquals($html,$qlHtml);
} }
} }