Compare commits

..

16 Commits

Author SHA1 Message Date
Jaeger(黄杰)
894fb4344e
Merge pull request #145 from maxiaozhi/master
正则匹配成功时才替换掉html
2021-08-08 13:04:55 +08:00
lion
e4fc716acd 正则匹配成功时才替换掉html 2021-07-18 23:37:35 +08:00
Jaeger(黄杰)
39dc0ca9c6
Merge pull request #143 from maxiaozhi/patch-1
Fix the matching exception
2021-07-05 14:07:58 +08:00
maxiaozhi
ef0a2efd4f
Fix the matching exception
Fix the matching exception when the page contains multiple tags prefixed with head (for example: < head >, < header >)
2021-07-05 13:51:24 +08:00
huangjie
5953daac54 update collect 2020-12-14 10:39:28 +08:00
huangjie
465c6aefc7 update collect 2020-09-27 17:41:44 +08:00
Jaeger(黄杰)
92cb319d44
Update README-ZH.md 2020-07-18 13:06:29 +08:00
Jaeger(黄杰)
cbf3e0fcad
Update README.md 2020-07-18 13:05:59 +08:00
Jaeger(黄杰)
cfa2d94a79
Update FUNDING.yml 2020-07-17 13:20:49 +08:00
Jaeger(黄杰)
47a444bf9e
Create FUNDING.yml 2020-07-17 13:08:44 +08:00
Jaeger
85903fa9b5 feat: rules add attrs 2020-04-03 20:16:00 +08:00
Jaeger(黄杰)
e527c637c7
Merge pull request #110 from jae-jae/develop
replace collect()
2020-04-03 04:55:27 -05:00
Jaeger
f0a9798925 replace collect() 2020-04-03 17:33:32 +08:00
Jaeger
faea883c6f fix: data callback 2020-04-01 22:03:50 +08:00
Jaeger
c16826a573 updaed composer dependency 2020-03-23 18:15:04 +08:00
Jaeger
1492751f98 feat: optimization getHtml() 2020-03-22 17:19:57 +08:00
15 changed files with 100 additions and 64 deletions

12
.github/FUNDING.yml vendored Normal file
View File

@ -0,0 +1,12 @@
# These are supported funding model platforms
github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
patreon: # Replace with a single Patreon username
open_collective: querylist # Replace with a single Open Collective username
ko_fi: # Replace with a single Ko-fi username
tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
liberapay: # Replace with a single Liberapay username
issuehunt: # Replace with a single IssueHunt username
otechie: # Replace with a single Otechie username
custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']

1
.gitignore vendored
View File

@ -2,3 +2,4 @@
.idea/ .idea/
composer.lock composer.lock
.DS_Store .DS_Store
*.cache

View File

@ -29,7 +29,7 @@
- ..... - .....
## 环境要求 ## 环境要求
- PHP >= 7.0 - PHP >= 7.1
> 如果你的PHP版本还停留在PHP5或者不会使用Composer,你可以选择使用QueryList3,QueryList3支持php5.3以及手动安装。 > 如果你的PHP版本还停留在PHP5或者不会使用Composer,你可以选择使用QueryList3,QueryList3支持php5.3以及手动安装。
QueryList3 文档:http://v3.querylist.cc QueryList3 文档:http://v3.querylist.cc

View File

@ -31,7 +31,7 @@ Through plug-ins you can easily implement things like:
- ..... - .....
## Requirements ## Requirements
- PHP >= 7.0 - PHP >= 7.1
## Installation ## Installation
By Composer installation: By Composer installation:

View File

@ -4,11 +4,11 @@
"keywords":["QueryList","phpQuery","spider"], "keywords":["QueryList","phpQuery","spider"],
"homepage": "http://querylist.cc", "homepage": "http://querylist.cc",
"require": { "require": {
"PHP":">=7.0", "PHP":">=7.1",
"jaeger/phpquery-single": "^1", "jaeger/phpquery-single": "^1",
"tightenco/collect": "^5",
"jaeger/g-http": "^1.1", "jaeger/g-http": "^1.1",
"ext-dom": "*" "ext-dom": "*",
"tightenco/collect": ">5.0"
}, },
"suggest":{ "suggest":{
@ -32,6 +32,9 @@
}, },
"require-dev": { "require-dev": {
"symfony/var-dumper": "^3.3", "symfony/var-dumper": "^3.3",
"phpunit/phpunit": "^7.5" "phpunit/phpunit": "^8.5"
},
"scripts": {
"test": "./vendor/bin/phpunit"
} }
} }

View File

@ -7,6 +7,7 @@
namespace QL; namespace QL;
use Closure; use Closure;
use Tightenco\Collect\Support\Collection;
class Config class Config
{ {
@ -20,8 +21,8 @@ class Config
*/ */
public function __construct() public function __construct()
{ {
$this->plugins = collect(); $this->plugins = new Collection();
$this->binds = collect(); $this->binds = new Collection();
} }

View File

@ -9,6 +9,7 @@ namespace QL\Dom;
use phpDocumentor\Reflection\Types\Null_; use phpDocumentor\Reflection\Types\Null_;
use phpQueryObject; use phpQueryObject;
use Tightenco\Collect\Support\Collection;
/** /**
* Class Elements * Class Elements
@ -191,7 +192,7 @@ class Elements
*/ */
public function map($callback) public function map($callback)
{ {
$collection = collect(); $collection = new Collection();
$this->elements->each(function ($dom) use (& $collection, $callback) { $this->elements->each(function ($dom) use (& $collection, $callback) {
$collection->push($callback(new self(pq($dom)))); $collection->push($callback(new self(pq($dom))));
}); });

View File

@ -35,11 +35,12 @@ class Query
} }
/** /**
* @return mixed * @param bool $rel
* @return String
*/ */
public function getHtml() public function getHtml($rel = true)
{ {
return $this->html; return $rel ? $this->document->htmlOuter() : $this->html;
} }
/** /**
@ -51,7 +52,7 @@ class Query
{ {
$this->html = value($html); $this->html = value($html);
$this->destroyDocument(); $this->destroyDocument();
$this->document = phpQuery::newDocumentHTML($this->html,$charset); $this->document = phpQuery::newDocumentHTML($this->html, $charset);
return $this->ql; return $this->ql;
} }
@ -63,7 +64,7 @@ class Query
*/ */
public function getData(Closure $callback = null) public function getData(Closure $callback = null)
{ {
return is_null($callback) ? $this->data : $this->data->map($callback); return $this->handleData($this->data, $callback);
} }
/** /**
@ -124,8 +125,8 @@ class Query
*/ */
public function removeHead() public function removeHead()
{ {
$html = preg_replace('/<head.+?>.+<\/head>/is','<head></head>',$this->html); $html = preg_replace('/(<head>|<head\s+.+?>).+?<\/head>/is', '<head></head>', $this->html);
$this->setHtml($html); $html && $this->setHtml($html);
return $this->ql; return $this->ql;
} }
@ -138,24 +139,37 @@ class Query
public function query(Closure $callback = null) public function query(Closure $callback = null)
{ {
$this->data = $this->getList(); $this->data = $this->getList();
$callback && $this->data = $this->data->map($callback); $this->data = $this->handleData($this->data, $callback);
return $this->ql; return $this->ql;
} }
/**
 * Apply an optional callback to the collected data.
 *
 * When no range is configured, the callback is invoked once with the
 * whole data array as its first argument and null as its second, and
 * whatever it returns is wrapped in a new Collection. When a range is
 * configured, the callback is mapped over the collection instead.
 * If $callback is not callable, the data is returned untouched.
 *
 * @param Collection $data     collected data to post-process
 * @param mixed      $callback callable to apply; anything non-callable is ignored
 * @return Collection
 */
public function handleData(Collection $data, $callback)
{
    // Nothing to apply: hand the data back as-is.
    if (!is_callable($callback)) {
        return $data;
    }

    // No range set: the callback receives the entire data set in one call.
    if (empty($this->range)) {
        $processed = $callback($data->all(), null);

        return new Collection($processed);
    }

    // Range set: let the collection apply the callback through map().
    return $data->map($callback);
}
protected function getList() protected function getList()
{ {
$data = []; $data = [];
if (empty($this->range)) { if (empty($this->range)) {
foreach ($this->rules as $key => $reg_value){ foreach ($this->rules as $key => $reg_value) {
$rule = $this->parseRule($reg_value); $rule = $this->parseRule($reg_value);
$contentElements = $this->document->find($rule['selector']); $contentElements = $this->document->find($rule['selector']);
$data[$key] = $this->extractContent($contentElements, $key, $rule); $data[$key] = $this->extractContent($contentElements, $key, $rule);
} }
} else { } else {
$rangeElements = $this->document->find($this->range); $rangeElements = $this->document->find($this->range);
$i = 0; $i = 0;
foreach ($rangeElements as $element) { foreach ($rangeElements as $element) {
foreach ($this->rules as $key => $reg_value){ foreach ($this->rules as $key => $reg_value) {
$rule = $this->parseRule($reg_value); $rule = $this->parseRule($reg_value);
$contentElements = pq($element)->find($rule['selector']); $contentElements = pq($element)->find($rule['selector']);
$data[$i][$key] = $this->extractContent($contentElements, $key, $rule); $data[$i][$key] = $this->extractContent($contentElements, $key, $rule);
@ -164,7 +178,7 @@ class Query
} }
} }
return collect($data); return new Collection($data);
} }
protected function extractContent(phpQueryObject $pqObj, $ruleName, $rule) protected function extractContent(phpQueryObject $pqObj, $ruleName, $rule)
@ -174,7 +188,7 @@ class Query
$content = $this->allowTags($pqObj->html(), $rule['filter_tags']); $content = $this->allowTags($pqObj->html(), $rule['filter_tags']);
break; break;
case 'texts': case 'texts':
$content = (new Elements($pqObj))->map(function(Elements $element) use($rule){ $content = (new Elements($pqObj))->map(function (Elements $element) use ($rule) {
return $this->allowTags($element->html(), $rule['filter_tags']); return $this->allowTags($element->html(), $rule['filter_tags']);
})->all(); })->all();
break; break;
@ -182,7 +196,7 @@ class Query
$content = $this->stripTags($pqObj->html(), $rule['filter_tags']); $content = $this->stripTags($pqObj->html(), $rule['filter_tags']);
break; break;
case 'htmls': case 'htmls':
$content = (new Elements($pqObj))->map(function(Elements $element) use($rule){ $content = (new Elements($pqObj))->map(function (Elements $element) use ($rule) {
return $this->stripTags($element->html(), $rule['filter_tags']); return $this->stripTags($element->html(), $rule['filter_tags']);
})->all(); })->all();
break; break;
@ -190,16 +204,22 @@ class Query
$content = $this->stripTags($pqObj->htmlOuter(), $rule['filter_tags']); $content = $this->stripTags($pqObj->htmlOuter(), $rule['filter_tags']);
break; break;
case 'htmlOuters': case 'htmlOuters':
$content = (new Elements($pqObj))->map(function(Elements $element) use($rule){ $content = (new Elements($pqObj))->map(function (Elements $element) use ($rule) {
return $this->stripTags($element->htmlOuter(), $rule['filter_tags']); return $this->stripTags($element->htmlOuter(), $rule['filter_tags']);
})->all(); })->all();
break; break;
default: default:
$content = $pqObj->attr($rule['attr']); if(preg_match('/attr\((.+)\)/', $rule['attr'], $arr)) {
$content = $pqObj->attr($arr[1]);
} elseif (preg_match('/attrs\((.+)\)/', $rule['attr'], $arr)) {
$content = (new Elements($pqObj))->attrs($arr[1])->all();
} else {
$content = $pqObj->attr($rule['attr']);
}
break; break;
} }
if(is_callable($rule['handle_callback'])){ if (is_callable($rule['handle_callback'])) {
$content = call_user_func($rule['handle_callback'], $content, $ruleName); $content = call_user_func($rule['handle_callback'], $content, $ruleName);
} }
@ -219,49 +239,47 @@ class Query
/** /**
* 去除特定的html标签 * 去除特定的html标签
* @param string $html * @param string $html
* @param string $tags_str 多个标签名之间用空格隔开 * @param string $tags_str 多个标签名之间用空格隔开
* @return string * @return string
*/ */
protected function stripTags($html,$tags_str) protected function stripTags($html, $tags_str)
{ {
$tagsArr = $this->tag($tags_str); $tagsArr = $this->tag($tags_str);
$html = $this->removeTags($html,$tagsArr[1]); $html = $this->removeTags($html, $tagsArr[1]);
$p = array(); $p = array();
foreach ($tagsArr[0] as $tag) { foreach ($tagsArr[0] as $tag) {
$p[]="/(<(?:\/".$tag."|".$tag.")[^>]*>)/i"; $p[] = "/(<(?:\/" . $tag . "|" . $tag . ")[^>]*>)/i";
} }
$html = preg_replace($p,"",trim($html)); $html = preg_replace($p, "", trim($html));
return $html; return $html;
} }
/** /**
* 保留特定的html标签 * 保留特定的html标签
* @param string $html * @param string $html
* @param string $tags_str 多个标签名之间用空格隔开 * @param string $tags_str 多个标签名之间用空格隔开
* @return string * @return string
*/ */
protected function allowTags($html,$tags_str) protected function allowTags($html, $tags_str)
{ {
$tagsArr = $this->tag($tags_str); $tagsArr = $this->tag($tags_str);
$html = $this->removeTags($html,$tagsArr[1]); $html = $this->removeTags($html, $tagsArr[1]);
$allow = ''; $allow = '';
foreach ($tagsArr[0] as $tag) { foreach ($tagsArr[0] as $tag) {
$allow .= "<$tag> "; $allow .= "<$tag> ";
} }
return strip_tags(trim($html),$allow); return strip_tags(trim($html), $allow);
} }
protected function tag($tags_str) protected function tag($tags_str)
{ {
$tagArr = preg_split("/\s+/",$tags_str,-1,PREG_SPLIT_NO_EMPTY); $tagArr = preg_split("/\s+/", $tags_str, -1, PREG_SPLIT_NO_EMPTY);
$tags = array(array(),array()); $tags = array(array(), array());
foreach($tagArr as $tag) foreach ($tagArr as $tag) {
{ if (preg_match('/-(.+)/', $tag, $arr)) {
if(preg_match('/-(.+)/', $tag,$arr))
{
array_push($tags[1], $arr[1]); array_push($tags[1], $arr[1]);
}else{ } else {
array_push($tags[0], $tag); array_push($tags[0], $tag);
} }
} }
@ -270,17 +288,16 @@ class Query
/** /**
* 移除特定的html标签 * 移除特定的html标签
* @param string $html * @param string $html
* @param array $tags 标签数组 * @param array $tags 标签数组
* @return string * @return string
*/ */
protected function removeTags($html,$tags) protected function removeTags($html, $tags)
{ {
$tag_str = ''; $tag_str = '';
if(count($tags)) if (count($tags)) {
{
foreach ($tags as $tag) { foreach ($tags as $tag) {
$tag_str .= $tag_str?','.$tag:$tag; $tag_str .= $tag_str ? ',' . $tag : $tag;
} }
// phpQuery::$defaultCharset = $this->inputEncoding?$this->inputEncoding:$this->htmlEncoding; // phpQuery::$defaultCharset = $this->inputEncoding?$this->inputEncoding:$this->htmlEncoding;
$doc = phpQuery::newDocumentHTML($html); $doc = phpQuery::newDocumentHTML($html);
@ -293,7 +310,7 @@ class Query
protected function destroyDocument() protected function destroyDocument()
{ {
if($this->document instanceof phpQueryObject) { if ($this->document instanceof phpQueryObject) {
$this->document->unloadDocument(); $this->document->unloadDocument();
} }
} }

View File

@ -14,6 +14,7 @@ use Closure;
use QL\Providers\HttpServiceProvider; use QL\Providers\HttpServiceProvider;
use QL\Providers\PluginServiceProvider; use QL\Providers\PluginServiceProvider;
use QL\Providers\SystemServiceProvider; use QL\Providers\SystemServiceProvider;
use Tightenco\Collect\Support\Collection;
class Kernel class Kernel
{ {
@ -34,7 +35,7 @@ class Kernel
public function __construct(QueryList $ql) public function __construct(QueryList $ql)
{ {
$this->ql = $ql; $this->ql = $ql;
$this->binds = collect(); $this->binds = new Collection();
} }
public function bootstrap() public function bootstrap()

View File

@ -23,7 +23,7 @@ use QL\Services\MultiRequestService;
* Class QueryList * Class QueryList
* @package QL * @package QL
* *
* @method string getHtml() * @method string getHtml($rel = true)
* @method QueryList setHtml($html) * @method QueryList setHtml($html)
* @method QueryList html($html) * @method QueryList html($html)
* @method Dom\Elements find($selector) * @method Dom\Elements find($selector)

View File

@ -17,7 +17,7 @@ class FindTest extends TestCaseBase
protected $html; protected $html;
protected $ql; protected $ql;
public function setUp() protected function setUp(): void
{ {
$this->html = $this->getSnippet('snippet-1'); $this->html = $this->getSnippet('snippet-1');
$this->ql = QueryList::html($this->html); $this->ql = QueryList::html($this->html);

View File

@ -18,7 +18,7 @@ class RulesTest extends TestCaseBase
protected $html; protected $html;
protected $ql; protected $ql;
public function setUp() protected function setUp(): void
{ {
$this->html = $this->getSnippet('snippet-2'); $this->html = $this->getSnippet('snippet-2');
$this->ql = QueryList::html($this->html); $this->ql = QueryList::html($this->html);

View File

@ -18,7 +18,7 @@ class HttpTest extends TestCaseBase
{ {
protected $urls; protected $urls;
public function setUp() protected function setUp(): void
{ {
$this->urls = [ $this->urls = [
'http://httpbin.org/get?name=php', 'http://httpbin.org/get?name=php',

View File

@ -16,7 +16,7 @@ class InstanceTest extends TestCaseBase
{ {
protected $html; protected $html;
public function setUp() protected function setUp(): void
{ {
$this->html = $this->getSnippet('snippet-1'); $this->html = $this->getSnippet('snippet-1');
} }
@ -38,11 +38,11 @@ class InstanceTest extends TestCaseBase
public function get_new_object() public function get_new_object()
{ {
$ql = (new QueryList())->html($this->html); $ql = (new QueryList())->html($this->html);
$ql2 = new QueryList(); $ql2 = (new QueryList())->html('');
$this->assertNotEquals($ql->getHtml(),$ql2->getHtml()); $this->assertNotEquals($ql->getHtml(),$ql2->getHtml());
$ql = QueryList::range('')->html($this->html); $ql = QueryList::range('')->html($this->html);
$ql2 = QueryList::range(''); $ql2 = QueryList::range('')->html('');
$this->assertNotEquals($ql->getHtml(),$ql2->getHtml()); $this->assertNotEquals($ql->getHtml(),$ql2->getHtml());
} }
} }

View File

@ -16,7 +16,7 @@ class MethodTest extends TestCaseBase
{ {
protected $html; protected $html;
public function setUp() protected function setUp(): void
{ {
$this->html = $this->getSnippet('snippet-1'); $this->html = $this->getSnippet('snippet-1');
} }
@ -30,7 +30,7 @@ class MethodTest extends TestCaseBase
$qlHtml = QueryList::pipe(function(QueryList $ql) use($html){ $qlHtml = QueryList::pipe(function(QueryList $ql) use($html){
$ql->setHtml($html); $ql->setHtml($html);
return $ql; return $ql;
})->getHtml(); })->getHtml(false);
$this->assertEquals($html,$qlHtml); $this->assertEquals($html,$qlHtml);
} }
} }