* Date: 2017/9/21 */ namespace QL\Dom; use Tightenco\Collect\Support\Collection; use phpQuery; use QL\QueryList; use Closure; class Query { protected $html; /** * @var \phpQueryObject */ protected $document; protected $rules; protected $range = null; protected $ql; /** * @var Collection */ protected $data; public function __construct(QueryList $ql) { $this->ql = $ql; } /** * @return mixed */ public function getHtml() { return $this->html; } /** * @param $html * @param null $charset * @return QueryList */ public function setHtml($html, $charset = null) { $this->html = value($html); if($this->document){ $this->destroyDocument(); } $this->document = phpQuery::newDocumentHTML($this->html,$charset); return $this->ql; } /** * Get crawl results * * @param Closure|null $callback * @return Collection|static */ public function getData(Closure $callback = null) { return is_null($callback) ? $this->data : $this->data->map($callback); } /** * @param Collection $data */ public function setData(Collection $data) { $this->data = $data; } /** * Searches for all elements that match the specified expression. * * @param $selector A string containing a selector expression to match elements against. * @return Elements */ public function find($selector) { return (new Dom($this->document))->find($selector); } /** * Set crawl rule * * $rules = [ * 'rule_name1' => ['selector','HTML attribute | text | html','Tag filter list','callback'], * 'rule_name2' => ['selector','HTML attribute | text | html','Tag filter list','callback'], * // ... * ] * * @param array $rules * @return QueryList */ public function rules(array $rules) { $this->rules = $rules; return $this->ql; } /** * Set the slice area for crawl list * * @param $selector * @return QueryList */ public function range($selector) { $this->range = $selector; return $this->ql; } /** * Remove HTML head,try to solve the garbled * * @return QueryList */ public function removeHead() { $html = preg_replace('/.+<\/head>/is','',$this->html); $this->setHtml($html); return $this->ql; } /** * Execute the query rule * * @param Closure|null $callback * @return QueryList */ public function query(Closure $callback = null) { $this->data = $this->getList(); $callback && $this->data = $this->data->map($callback); return $this->ql; } protected function getList() { $data = []; if (!empty($this->range)) { $robj = $this->document->find($this->range); $i = 0; foreach ($robj as $item) { foreach ($this->rules as $key => $reg_value){ $tags = $reg_value[2] ?? ''; $iobj = pq($item,$this->document)->find($reg_value[0]); switch ($reg_value[1]) { case 'text': $data[$i][$key] = $this->allowTags(pq($iobj)->html(),$tags); break; case 'html': $data[$i][$key] = $this->stripTags(pq($iobj)->html(),$tags); break; default: $data[$i][$key] = pq($iobj)->attr($reg_value[1]); break; } if(isset($reg_value[3])){ $data[$i][$key] = call_user_func($reg_value[3],$data[$i][$key],$key); } } $i++; } } else { foreach ($this->rules as $key => $reg_value){ $tags = $reg_value[2] ?? ''; $lobj = $this->document->find($reg_value[0]); $i = 0; foreach ($lobj as $item) { switch ($reg_value[1]) { case 'text': $data[$i][$key] = $this->allowTags(pq($item,$this->document)->html(),$tags); break; case 'html': $data[$i][$key] = $this->stripTags(pq($item,$this->document)->html(),$tags); break; default: $data[$i][$key] = pq($item,$this->document)->attr($reg_value[1]); break; } if(isset($reg_value[3])){ $data[$i][$key] = call_user_func($reg_value[3],$data[$i][$key],$key); } $i++; } } } // phpQuery::$documents = array(); return collect($data); } /** * 去除特定的html标签 * @param string $html * @param string $tags_str 多个标签名之间用空格隔开 * @return string */ protected function stripTags($html,$tags_str) { $tagsArr = $this->tag($tags_str); $html = $this->removeTags($html,$tagsArr[1]); $p = array(); foreach ($tagsArr[0] as $tag) { $p[]="/(<(?:\/".$tag."|".$tag.")[^>]*>)/i"; } $html = preg_replace($p,"",trim($html)); return $html; } /** * 保留特定的html标签 * @param string $html * @param string $tags_str 多个标签名之间用空格隔开 * @return string */ protected function allowTags($html,$tags_str) { $tagsArr = $this->tag($tags_str); $html = $this->removeTags($html,$tagsArr[1]); $allow = ''; foreach ($tagsArr[0] as $tag) { $allow .= "<$tag> "; } return strip_tags(trim($html),$allow); } protected function tag($tags_str) { $tagArr = preg_split("/\s+/",$tags_str,-1,PREG_SPLIT_NO_EMPTY); $tags = array(array(),array()); foreach($tagArr as $tag) { if(preg_match('/-(.+)/', $tag,$arr)) { array_push($tags[1], $arr[1]); }else{ array_push($tags[0], $tag); } } return $tags; } /** * 移除特定的html标签 * @param string $html * @param array $tags 标签数组 * @return string */ protected function removeTags($html,$tags) { $tag_str = ''; if(count($tags)) { foreach ($tags as $tag) { $tag_str .= $tag_str?','.$tag:$tag; } // phpQuery::$defaultCharset = $this->inputEncoding?$this->inputEncoding:$this->htmlEncoding; $doc = phpQuery::newDocumentHTML($html); pq($doc)->find($tag_str)->remove(); $html = pq($doc)->htmlOuter(); $doc->unloadDocument(); } return $html; } protected function destroyDocument() { $this->document->unloadDocument(); unset($this->document); } public function __destruct() { $this->destroyDocument(); } }