add composer

This commit is contained in:
JAE 2015-12-22 18:10:14 +08:00
parent 825ebc546a
commit b820f2bd53
5 changed files with 18 additions and 7334 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
/vendor/

17
composer.json Normal file
View File

@ -0,0 +1,17 @@
{
"name": "jaeger/querylist",
"description": "QueryList是基于phpQuery的无比强大的PHP采集工具",
"require": {
"php": ">=5.3.0",
"jaeger/phpquery-single": "^0.9.5",
"jaeger/curlmulti":"^1.0",
"jaeger/http":"^0.1"
},
"license": "MIT",
"authors": [
{
"name": "Jaeger",
"email": "hj.q@qq.com"
}
]
}

View File

@ -1,698 +0,0 @@
<?php
/**
* Chrome Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.47 Safari/536.11
* IE6 Mozilla/5.0 (Windows NT 6.1; rv:9.0.1) Gecko/20100101 Firefox/9.0.1
* FF Mozilla/5.0 (Windows NT 6.1; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0
*
* more useragent:http://www.useragentstring.com/
*
* @author admin@phpdr.net
*
*/
class CurlMulti {
// url
const TASK_ITEM_URL = 0x01;
// file
const TASK_ITEM_FILE = 0x02;
// arguments
const TASK_ITEM_ARGS = 0x03;
// operation, task level
const TASK_ITEM_OPT = 0x04;
// control options
const TASK_ITEM_CTL = 0x05;
// file pointer
const TASK_FP = 0x06;
// success callback
const TASK_PROCESS = 0x07;
// curl fail callback
const TASK_FAIL = 0x08;
// number of times the task has already been tried
const TASK_TRYED = 0x09;
// handler
const TASK_CH = 0x0A;
// global max thread num
public $maxThread = 10;
// Max thread by task type.Task type is specified in $item['ctl'] in add().If task has no type,$this->maxThreadNoType is maxThread-sum(maxThreadType).If less than 0 $this->maxThreadNoType is set to 0.
public $maxThreadType = array ();
// retry time(s) when task failed
public $maxTry = 3;
// operation, class level curl opt
public $opt = array ();
// cache options; dirLevel must be 0, 1 or 2 (number of sub-directory levels under 'dir')
public $cache = array (
'enable' => false,
'enableDownload' => false,
'compress' => false,
'dir' => null,
'expire' => 86400,
'dirLevel' => 1
);
// stack or queue
public $taskPoolType = 'stack';
// eliminate duplicate for taskpool, will delete previous task and add new one
public $taskOverride = false;
// task callback,add() should be called in callback, $cbTask[0] is callback, $cbTask[1] is param.
public $cbTask = null;
// status callback
public $cbInfo = null;
// user callback
public $cbUser = null;
// common fail callback, called if no one specified
public $cbFail = null;
// is the loop running
protected $isRunning = false;
// max thread num no type
protected $maxThreadNoType = null;
// all added task was saved here first
protected $taskPool = array ();
// taskPool with high priority
protected $taskPoolAhead = array ();
// running task(s)
protected $taskRunning = array ();
// failed task need to retry
protected $taskFail = array ();
// handle of multi-thread curl
private $mh = null;
// user error
private $userError = null;
// if __construct called
private $isConstructCalled = false;
// running info
private $info = array (
'all' => array (
// process start time
'startTime' => null,
// download start time
'startTimeDownload' => null,
// the real multi-thread num
'activeNum' => null,
// finished task in the queue
'queueNum' => null,
// byte
'downloadSize' => 0,
// finished task number,include failed task and cache
'finishNum' => 0,
// The number of cache used
'cacheNum' => 0,
// completely failed task number
'failNum' => 0,
// task num has added
'taskNum' => 0,
// task running num by type,
'taskRunningNumType' => array (),
// number of running tasks that have no type
'taskRunningNumNoType' => 0,
// $this->taskPool size
'taskPoolNum' => 0,
// $this->taskRunning size
'taskRunningNum' => 0,
// $this->taskFail size
'taskFailNum' => 0,
// finish percent
'finishPercent' => 0,
// time cost
'timeSpent' => 0,
// download time cost
'timeSpentDownload' => 0,
// curl task speed
'taskSpeedNoCache' => 0,
// network speed, bytes
'downloadSpeed' => 0
),
'running' => array ()
);
function __construct() {
    // start() refuses to run unless this flag has been set.
    $this->isConstructCalled = true;
    $isTooOld = version_compare ( PHP_VERSION, '5.1.0', '<' );
    if ($isTooOld) {
        throw new CurlMulti_Exception ( 'PHP 5.1.0+ is needed' );
    }
}
/**
 * Add a task to the task pool.
 *
 * Invalid items (missing or empty 'url') only raise an E_USER_WARNING and are
 * ignored, so calls can always be chained.
 *
 * @param array $item
 *        	array('url'=>'',['file'=>'',['opt'=>array(),['args'=>array(),['ctl'=>array('type'=>'','ahead'=>false,'cache'=>array('enable'=>bool,'expire'=>0),'close'=>true))]]]])
 * @param mixed $process
 *        	success callback, first param array('info'=>,'content'=>), second param $item[args]
 * @param mixed $fail
 *        	curl fail callback, first param array('error'=>array(0=>code,1=>msg),'info'=>array), second param $item[args]
 * @throws CurlMulti_Exception raised by addTaskPool() when $this->taskPoolType is neither 'stack' nor 'queue'
 * @return CurlMulti $this, for chaining
 */
function add(array $item, $process = null, $fail = null) {
    // The array type hint already guarantees $item is an array; only the url
    // needs validating. isset() avoids an undefined-index notice when the key
    // is missing altogether.
    $url = isset ( $item ['url'] ) ? trim ( $item ['url'] ) : '';
    if (empty ( $url )) {
        user_error ( "url can't be empty, url=$url", E_USER_WARNING );
        return $this;
    }
    // replace space with + to avoid some curl problems
    $url = str_replace ( ' ', '+', $url );

    // normalise the control options
    $ctl = empty ( $item ['ctl'] ) ? array () : $item ['ctl'];
    if (! isset ( $ctl ['cache'] ) || ! isset ( $ctl ['cache'] ['enable'] )) {
        $ctl ['cache'] = array (
            'enable' => false,
            'expire' => 0
        );
    }
    if (! isset ( $ctl ['ahead'] )) {
        $ctl ['ahead'] = false;
    }

    $task = array ();
    $task [self::TASK_ITEM_URL] = $url;
    $task [self::TASK_ITEM_FILE] = empty ( $item ['file'] ) ? null : $item ['file'];
    // args are wrapped in a list because process()/start() prepend the
    // result (or error) array before calling the user callback
    $task [self::TASK_ITEM_ARGS] = array (
        empty ( $item ['args'] ) ? array () : $item ['args']
    );
    $task [self::TASK_ITEM_OPT] = empty ( $item ['opt'] ) ? array () : $item ['opt'];
    $task [self::TASK_ITEM_CTL] = $ctl;
    $task [self::TASK_PROCESS] = empty ( $process ) ? null : $process;
    $task [self::TASK_FAIL] = empty ( $fail ) ? null : $fail;
    $task [self::TASK_TRYED] = 0;
    $task [self::TASK_CH] = null;

    $this->addTaskPool ( $task );
    $this->info ['all'] ['taskNum'] ++;
    return $this;
}
/**
 * Put a prepared task into the matching pool.
 *
 * When $this->taskOverride is on, every pooled task with the same url is
 * dropped first. Tasks flagged 'ahead' go to the priority pool; all others go
 * to the regular pool, positioned according to $this->taskPoolType.
 *
 * @param array $task task array as built by add()
 * @throws CurlMulti_Exception when a non-ahead task is added and taskPoolType is neither 'queue' nor 'stack'
 */
private function addTaskPool($task) {
    if ($this->taskOverride) {
        $url = $task [self::TASK_ITEM_URL];
        foreach ( array (
            'taskPoolAhead',
            'taskPool'
        ) as $poolName ) {
            $pool = &$this->$poolName;
            foreach ( array_keys ( $pool ) as $index ) {
                if ($pool [$index] [self::TASK_ITEM_URL] == $url) {
                    unset ( $pool [$index] );
                }
            }
            unset ( $pool );
        }
    }

    if ($task [self::TASK_ITEM_CTL] ['ahead'] == true) {
        $this->taskPoolAhead [] = $task;
        return;
    }
    if ($this->taskPoolType == 'queue') {
        $this->taskPool [] = $task;
        return;
    }
    if ($this->taskPoolType == 'stack') {
        array_unshift ( $this->taskPool, $task );
        return;
    }
    throw new CurlMulti_Exception ( 'taskPoolType not found, taskPoolType=' . $this->taskPoolType );
}
/**
 * Perform the actual task(s).
 *
 * Initialises the multi handle and the run statistics, feeds tasks to curl
 * via addTask(), and loops until nothing is active, queued, failed-and-waiting,
 * running or left in the task pool. For every finished transfer it calls the
 * success path (process()) or, once $this->maxTry retries are used up, the
 * fail callback.
 *
 * @throws CurlMulti_Exception if the loop is already running or __construct() was not called
 */
function start() {
    if ($this->isRunning) {
        throw new CurlMulti_Exception ( __CLASS__ . ' is running !' );
    }
    if (false === $this->isConstructCalled) {
        throw new CurlMulti_Exception ( __CLASS__ . ' __construct is not called' );
    }
    $this->mh = curl_multi_init ();
    $this->info ['all'] ['startTime'] = time ();
    // NOTE(review): this key is 'timeStartDownload' while the $info defaults
    // declare 'startTimeDownload'; addTask()/callCbInfo() use this spelling.
    $this->info ['all'] ['timeStartDownload'] = null;
    $this->info ['all'] ['downloadSize'] = 0;
    $this->info ['all'] ['finishNum'] = 0;
    $this->info ['all'] ['cacheNum'] = 0;
    $this->info ['all'] ['failNum'] = 0;
    // NOTE(review): add() increments taskNum before start() is called, and it
    // is reset here - confirm this is intended.
    $this->info ['all'] ['taskNum'] = 0;
    $this->info ['all'] ['taskRunningNumNoType'] = 0;
    $this->setThreadData ();
    $this->isRunning = true;
    $this->addTask ();
    do {
        $this->exec ();
        curl_multi_select ( $this->mh );
        $this->callCbInfo ();
        if (isset ( $this->cbUser )) {
            call_user_func ( $this->cbUser );
        }
        while ( false != ($curlInfo = curl_multi_info_read ( $this->mh, $this->info ['all'] ['queueNum'] )) ) {
            $ch = $curlInfo ['handle'];
            $task = $this->taskRunning [( int ) $ch];
            $info = curl_getinfo ( $ch );
            // Read the error text while the handle is still open: the handle
            // may be closed below, before the error array is built.
            $curlError = curl_error ( $ch );
            $this->info ['all'] ['downloadSize'] += $info ['size_download'];
            if (isset ( $task [self::TASK_FP] )) {
                fclose ( $task [self::TASK_FP] );
            }
            if ($curlInfo ['result'] == CURLE_OK) {
                $param = array ();
                $param ['info'] = $info;
                $param ['ext'] = array (
                    'ch' => $ch
                );
                if (! isset ( $task [self::TASK_ITEM_FILE] )) {
                    $param ['content'] = curl_multi_getcontent ( $ch );
                }
            }
            curl_multi_remove_handle ( $this->mh, $ch );
            // must close first, otherwise the download may not be completed in the process callback
            if (! array_key_exists ( 'close', $task [self::TASK_ITEM_CTL] ) || $task [self::TASK_ITEM_CTL] ['close'] == true) {
                curl_close ( $ch );
            }
            if ($curlInfo ['result'] == CURLE_OK) {
                $this->process ( $task, $param, false );
            }
            // error handle
            $callFail = false;
            if ($curlInfo ['result'] !== CURLE_OK || isset ( $this->userError )) {
                if ($task [self::TASK_TRYED] >= $this->maxTry) {
                    // user error
                    if (isset ( $this->userError )) {
                        $err = array (
                            'error' => $this->userError
                        );
                    } else {
                        $err = array (
                            'error' => array (
                                $curlInfo ['result'],
                                $curlError
                            )
                        );
                    }
                    $err ['info'] = $info;
                    if (isset ( $task [self::TASK_FAIL] ) || isset ( $this->cbFail )) {
                        array_unshift ( $task [self::TASK_ITEM_ARGS], $err );
                        $callFail = true;
                    } else {
                        echo "\nError " . implode ( ', ', $err ['error'] ) . ", url=$info[url]\n";
                    }
                    $this->info ['all'] ['failNum'] ++;
                } else {
                    // schedule a retry that bypasses the cache
                    $task [self::TASK_TRYED] ++;
                    $task [self::TASK_ITEM_CTL] ['useCache'] = false;
                    $this->taskFail [] = $task;
                    $this->info ['all'] ['taskNum'] ++;
                }
                if (isset ( $this->userError )) {
                    unset ( $this->userError );
                }
            }
            if ($callFail) {
                // the task-level callback wins over the class-level one
                if (isset ( $task [self::TASK_FAIL] )) {
                    call_user_func_array ( $task [self::TASK_FAIL], $task [self::TASK_ITEM_ARGS] );
                } elseif (isset ( $this->cbFail )) {
                    call_user_func_array ( $this->cbFail, $task [self::TASK_ITEM_ARGS] );
                }
            }
            unset ( $this->taskRunning [( int ) $ch] );
            if (array_key_exists ( 'type', $task [self::TASK_ITEM_CTL] )) {
                $this->info ['all'] ['taskRunningNumType'] [$task [self::TASK_ITEM_CTL] ['type']] --;
            } else {
                $this->info ['all'] ['taskRunningNumNoType'] --;
            }
            $this->addTask ();
            $this->info ['all'] ['finishNum'] ++;
            // if $this->info['all']['queueNum'] grow very fast there will be no efficiency lost,because outer $this->exec() won't be executed.
            $this->exec ();
            $this->callCbInfo ();
            if (isset ( $this->cbUser )) {
                call_user_func ( $this->cbUser );
            }
        }
    } while ( $this->info ['all'] ['activeNum'] || $this->info ['all'] ['queueNum'] || ! empty ( $this->taskFail ) || ! empty ( $this->taskRunning ) || ! empty ( $this->taskPool ) );
    $this->callCbInfo ( true );
    curl_multi_close ( $this->mh );
    unset ( $this->mh );
    $this->isRunning = false;
}
/**
 * Refresh the statistics in $this->info and hand them to $this->cbInfo.
 *
 * Does nothing when no cbInfo callback is set. Unless $force is true the
 * callback is skipped while the clock still shows the same second as the
 * previous invocation.
 *
 * @param bool $force call the callback regardless of the time elapsed
 */
private function callCbInfo($force = false) {
    static $lastTime;
    if (! isset ( $lastTime )) {
        $lastTime = time ();
    }
    $now = time ();
    $isDue = $force || $now - $lastTime > 0;
    if (! $isDue || ! isset ( $this->cbInfo )) {
        return;
    }
    $lastTime = $now;

    // work on a local copy of the 'all' section and store it back afterwards
    $all = $this->info ['all'];
    $all ['taskPoolNum'] = count ( $this->taskPool );
    $all ['taskRunningNum'] = count ( $this->taskRunning );
    $all ['taskFailNum'] = count ( $this->taskFail );
    if ($all ['taskNum'] > 0) {
        $all ['finishPercent'] = round ( $all ['finishNum'] / $all ['taskNum'], 4 );
    }
    $all ['timeSpent'] = time () - $all ['startTime'];
    if (isset ( $all ['timeStartDownload'] )) {
        $all ['timeSpentDownload'] = time () - $all ['timeStartDownload'];
    }
    $downloadSeconds = $all ['timeSpentDownload'];
    if ($downloadSeconds > 0) {
        $all ['taskSpeedNoCache'] = round ( ($all ['finishNum'] - $all ['cacheNum']) / $downloadSeconds, 2 );
        $all ['downloadSpeed'] = round ( $all ['downloadSize'] / $downloadSeconds, 2 );
    }
    $this->info ['all'] = $all;

    // per-handle curl info of everything currently running
    $running = array ();
    foreach ( $this->taskRunning as $key => $runningTask ) {
        $running [$key] = curl_getinfo ( $runningTask [self::TASK_CH] );
    }
    $this->info ['running'] = $running;

    call_user_func_array ( $this->cbInfo, array (
        $this->info
    ) );
}
/**
 * Recompute the per-type thread bookkeeping.
 *
 * Sets $this->maxThreadNoType to maxThread minus the sum of maxThreadType
 * (never below 0), removes idle running-counters of types that are no longer
 * configured, and creates a zero counter for every configured type.
 */
private function setThreadData() {
    $untypedLimit = $this->maxThread - array_sum ( $this->maxThreadType );
    $this->maxThreadNoType = $untypedLimit < 0 ? 0 : $untypedLimit;

    // drop counters of types that are idle and no longer configured
    foreach ( $this->info ['all'] ['taskRunningNumType'] as $type => $runningCount ) {
        if (array_key_exists ( $type, $this->maxThreadType )) {
            continue;
        }
        if ($runningCount == 0) {
            unset ( $this->info ['all'] ['taskRunningNumType'] [$type] );
        }
    }

    // make sure every configured type has a counter
    foreach ( $this->maxThreadType as $type => $limit ) {
        if ($limit == 0) {
            user_error ( 'maxThreadType[' . $type . '] is 0, task of this type will never be added!', E_USER_WARNING );
        }
        if (array_key_exists ( $type, $this->info ['all'] ['taskRunningNumType'] )) {
            continue;
        }
        $this->info ['all'] ['taskRunningNumType'] [$type] = 0;
    }
}
/**
 * Drive curl_multi_exec() until it stops asking to be called again.
 *
 * The number of still-active handles is stored in $this->info['all']['activeNum'].
 */
private function exec() {
    do {
        $status = curl_multi_exec ( $this->mh, $this->info ['all'] ['activeNum'] );
    } while ( $status === CURLM_CALL_MULTI_PERFORM );
}
/**
 * Move tasks from the pools into curl, keeping up to $this->maxThread
 * transfers running.
 *
 * Source order per slot: failed tasks waiting for retry, then the priority
 * pool (taskPoolAhead), then the regular pool (stack or queue order).
 * A task answered from the cache is processed immediately and does not use
 * up a slot. A task whose type (or the untyped budget) is already at its
 * limit is rotated back to the front of its pool.
 *
 * @throws CurlMulti_Exception if $this->taskPoolType is neither 'stack' nor 'queue'
 */
private function addTask() {
// number of free slots
$c = $this->maxThread - count ( $this->taskRunning );
while ( $c > 0 ) {
$task = array ();
// failed tasks are retried first
if (! empty ( $this->taskFail )) {
$task = array_pop ( $this->taskFail );
} else {
// cbTask: when the regular pool holds fewer than maxThread tasks, call the
// user callback (which is expected to call add()) with $this->cbTask[1] as argument
if (0 < ($this->maxThread - count ( $this->taskPool )) and ! empty ( $this->cbTask )) {
if (! isset ( $this->cbTask [1] )) {
$this->cbTask [1] = array ();
}
call_user_func_array ( $this->cbTask [0], array (
$this->cbTask [1]
) );
}
// priority pool first, then the regular pool
if (! empty ( $this->taskPoolAhead )) {
$task = array_pop ( $this->taskPoolAhead );
} elseif (! empty ( $this->taskPool )) {
if ($this->taskPoolType == 'stack') {
$task = array_pop ( $this->taskPool );
} elseif ($this->taskPoolType == 'queue') {
$task = array_shift ( $this->taskPool );
} else {
throw new CurlMulti_Exception ( 'taskPoolType not found, taskPoolType=' . $this->taskPoolType );
}
}
}
$noAdd = false;
$cache = null;
if (! empty ( $task )) {
// try the cache when it is enabled for this task or globally
if (true == $task [self::TASK_ITEM_CTL] ['cache'] ['enable'] || $this->cache ['enable']) {
$cache = $this->cache ( $task );
if (null !== $cache) {
// cache hit: for download tasks write the cached body to the target file,
// then run the success callback with $isCache = true
if (isset ( $task [self::TASK_ITEM_FILE] )) {
file_put_contents ( $task [self::TASK_ITEM_FILE], $cache ['content'], LOCK_EX );
unset ( $cache ['content'] );
}
$this->process ( $task, $cache, true );
$this->info ['all'] ['cacheNum'] ++;
$this->info ['all'] ['finishNum'] ++;
$this->callCbInfo ();
}
}
// no usable cache entry: the task has to go through curl
if (! $cache) {
$this->setThreadData ();
// a type that is not configured in maxThreadType is treated as "no type"
if (array_key_exists ( 'type', $task [self::TASK_ITEM_CTL] ) && ! array_key_exists ( $task [self::TASK_ITEM_CTL] ['type'], $this->maxThreadType )) {
user_error ( 'task was set to notype because type was not set in $this->maxThreadType, type=' . $task [self::TASK_ITEM_CTL] ['type'], E_USER_WARNING );
unset ( $task [self::TASK_ITEM_CTL] ['type'] );
}
// pick the concurrency limit that applies to this task
if (array_key_exists ( 'type', $task [self::TASK_ITEM_CTL] )) {
$maxThread = $this->maxThreadType [$task [self::TASK_ITEM_CTL] ['type']];
$isNoType = false;
} else {
$maxThread = $this->maxThreadNoType;
$isNoType = true;
}
// NOTE(review): the warning says the task is discarded, but the else-branch
// below rotates it back into a pool instead of dropping it
if ($isNoType && $maxThread == 0) {
user_error ( 'task was disgarded because maxThreadNoType=0, url=' . $task [self::TASK_ITEM_URL], E_USER_WARNING );
}
if (($isNoType && $this->info ['all'] ['taskRunningNumNoType'] < $maxThread) || (! $isNoType && $this->info ['all'] ['taskRunningNumType'] [$task [self::TASK_ITEM_CTL] ['type']] < $maxThread)) {
$task [self::TASK_CH] = $this->curlInit ( $task [self::TASK_ITEM_URL] );
// is a download task?
if (isset ( $task [self::TASK_ITEM_FILE] )) {
// create the last directory level if it is missing (mkdir here is not recursive)
$dir = dirname ( $task [self::TASK_ITEM_FILE] );
if (! file_exists ( $dir ))
mkdir ( $dir, 0777 );
$task [self::TASK_FP] = fopen ( $task [self::TASK_ITEM_FILE], 'w' );
curl_setopt ( $task [self::TASK_CH], CURLOPT_FILE, $task [self::TASK_FP] );
}
// task-level curl options are applied after the class-level ones set by curlInit()
if (isset ( $task [self::TASK_ITEM_OPT] )) {
foreach ( $task [self::TASK_ITEM_OPT] as $k => $v ) {
curl_setopt ( $task [self::TASK_CH], $k, $v );
}
}
// running tasks are indexed by the integer value of their curl handle
$this->taskRunning [( int ) $task [self::TASK_CH]] = $task;
// remember when the first real download started
if (! isset ( $this->info ['all'] ['timeStartDownload'] )) {
$this->info ['all'] ['timeStartDownload'] = time ();
}
if ($isNoType) {
$this->info ['all'] ['taskRunningNumNoType'] ++;
} else {
$this->info ['all'] ['taskRunningNumType'] [$task [self::TASK_ITEM_CTL] ['type']] ++;
}
curl_multi_add_handle ( $this->mh, $task [self::TASK_CH] );
} else {
// limit reached: rotate the task to the front of the retry list or the regular pool
if ($task [self::TASK_TRYED] > 0) {
array_unshift ( $this->taskFail, $task );
} else {
array_unshift ( $this->taskPool, $task );
}
$noAdd = true;
}
}
}
// a cache hit does not consume a slot; everything else (task added, task
// rotated back, or no task available) does, which also ends the loop when
// all pools are empty
if (! $cache || $noAdd) {
$c --;
}
}
}
/**
 * Run the success callback of a task and store the result in the cache.
 *
 * The callback receives $r as its first argument followed by the task's
 * args. If it returns exactly false the task is put back into the pool
 * ("backoff"); when no cache is active a one hour task-level cache is
 * switched on for it first.
 *
 * The result is written to the cache only when it did not come from the
 * cache, no user error was raised through error(), and caching is enabled
 * either for the task or globally.
 *
 * @param array $task task array as built by add()
 * @param array $r result passed to the callback (array('info'=>..., 'content'=>..., ...))
 * @param bool $isCache true when $r was read from the cache
 * @throws CurlMulti_Exception raised by cache() or addTaskPool()
 */
private function process($task, $r, $isCache) {
    array_unshift ( $task [self::TASK_ITEM_ARGS], $r );
    if (isset ( $task [self::TASK_PROCESS] )) {
        $userRes = call_user_func_array ( $task [self::TASK_PROCESS], $task [self::TASK_ITEM_ARGS] );
    }
    // a missing callback or a null return counts as success
    if (! isset ( $userRes )) {
        $userRes = true;
    }
    array_shift ( $task [self::TASK_ITEM_ARGS] );
    // backoff
    if (false === $userRes) {
        if (false == $this->cache ['enable'] && false == $task [self::TASK_ITEM_CTL] ['cache'] ['enable']) {
            $task [self::TASK_ITEM_CTL] ['cache'] = array (
                'enable' => true,
                'expire' => 3600
            );
        }
        $this->addTaskPool ( $task );
    }
    // write cache
    // Both cache switches are grouped explicitly: without the grouping, "&&"
    // binds tighter than "||", so an enabled global cache would rewrite the
    // entry even for cache hits and after a user error.
    $cacheEnabled = (true == $task [self::TASK_ITEM_CTL] ['cache'] ['enable']) || $this->cache ['enable'];
    if (false == $isCache && false == isset ( $this->userError ) && $cacheEnabled) {
        $this->cache ( $task, $r );
    }
}
/**
 * Read or write the file cache entry of a task.
 *
 * The entry is keyed by md5(url) and stored under $this->cache['dir'],
 * spread over 0, 1 or 2 sub-directory levels according to
 * $this->cache['dirLevel']. Entries are serialized, optionally gzcompress()ed,
 * and for download tasks the file body is stored base64 encoded.
 *
 * Called without $content it reads: a missing, expired, unreadable or
 * corrupt entry is reported as a miss (null). Called with $content it writes.
 *
 * @param array $task task array as built by add()
 * @param array|null $content result to store (array with 'info' and 'content'), or null to read
 * @throws CurlMulti_Exception if the cache dir is undefined or missing, dirLevel is invalid,
 *         a sub-directory cannot be created, or the cache file cannot be written
 * @return array|null|bool the cached array or null on a miss when reading; true after a successful write
 */
private function cache($task, $content = null) {
    if (! isset ( $this->cache ['dir'] )) {
        throw new CurlMulti_Exception ( 'Cache dir is not defined' );
    }
    $url = $task [self::TASK_ITEM_URL];
    $key = md5 ( $url );
    $isDownload = isset ( $task [self::TASK_ITEM_FILE] );

    // build the cache file path
    $file = rtrim ( $this->cache ['dir'], '/' ) . '/';
    if (isset ( $this->cache ['dirLevel'] ) && $this->cache ['dirLevel'] != 0) {
        if ($this->cache ['dirLevel'] == 1) {
            $file .= substr ( $key, 0, 3 ) . '/' . substr ( $key, 3 );
        } elseif ($this->cache ['dirLevel'] == 2) {
            $file .= substr ( $key, 0, 3 ) . '/' . substr ( $key, 3, 3 ) . '/' . substr ( $key, 6 );
        } else {
            throw new CurlMulti_Exception ( 'cache dirLevel is invalid, dirLevel=' . $this->cache ['dirLevel'] );
        }
    } else {
        $file .= $key;
    }

    if (! isset ( $content )) {
        // ---- read ----
        if (! file_exists ( $file )) {
            return null;
        }
        // the task-level expire wins when the task-level cache is enabled
        if (true == $task [self::TASK_ITEM_CTL] ['cache'] ['enable']) {
            $expire = $task [self::TASK_ITEM_CTL] ['cache'] ['expire'];
        } else {
            $expire = $this->cache ['expire'];
        }
        if (time () - filemtime ( $file ) >= $expire) {
            return null;
        }
        $raw = file_get_contents ( $file );
        if (false !== $raw && $this->cache ['compress']) {
            $raw = gzuncompress ( $raw );
        }
        $data = (false === $raw) ? false : unserialize ( $raw );
        // An unreadable or corrupt entry must not be handed to the callback
        // as if it were a result; treat it as a cache miss instead.
        if (! is_array ( $data )) {
            return null;
        }
        if ($isDownload && isset ( $data ['content'] )) {
            $data ['content'] = base64_decode ( $data ['content'] );
        }
        return $data;
    }

    // ---- write ----
    // check main cache directory
    if (! is_dir ( $this->cache ['dir'] )) {
        throw new CurlMulti_Exception ( "Cache dir doesn't exists" );
    }
    $dir = dirname ( $file );
    // level 1 subdir has to exist before the level 2 subdir can be created
    if (isset ( $this->cache ['dirLevel'] ) && $this->cache ['dirLevel'] > 1) {
        $dir1 = dirname ( $dir );
        if (! is_dir ( $dir1 ) && ! mkdir ( $dir1 )) {
            throw new CurlMulti_Exception ( 'Create dir failed, dir=' . $dir1 );
        }
    }
    if (! is_dir ( $dir ) && ! mkdir ( $dir )) {
        throw new CurlMulti_Exception ( 'Create dir failed, dir=' . $dir );
    }
    // download tasks keep their body in the target file, not in $content
    if ($isDownload) {
        $content ['content'] = base64_encode ( file_get_contents ( $task [self::TASK_ITEM_FILE] ) );
    }
    $content = serialize ( $content );
    if ($this->cache ['compress']) {
        $content = gzcompress ( $content );
    }
    if (! file_put_contents ( $file, $content, LOCK_EX )) {
        throw new CurlMulti_Exception ( 'Write cache file failed' );
    }
    return true;
}
/**
 * Flag the current task as failed from inside a process callback.
 *
 * This records a user error, not a curl error: the stored code is CURLE_OK
 * and only the message is supplied by the caller.
 *
 * @param mixed $msg error message
 */
function error($msg) {
    $userError = array ();
    $userError [] = CURLE_OK;
    $userError [] = $msg;
    $this->userError = $userError;
}
/**
 * Public access to a curl handle pre-configured with the class defaults and
 * the class-level options in $this->opt.
 *
 * @param string|null $url url to set on the handle, if any
 * @return resource
 */
function getch($url = null) {
    $handle = $this->curlInit ( $url );
    return $handle;
}
/**
 * Create a curl handle carrying the default options, overridden by the
 * class-level options in $this->opt.
 *
 * @param string|null $url url to set on the handle, if any
 * @return resource
 */
private function curlInit($url = null) {
    $options = array ();
    if (null !== $url) {
        $options [CURLOPT_URL] = $url;
    }
    // built-in defaults, appended after the url
    $options += array (
        CURLOPT_HEADER => false,
        CURLOPT_CONNECTTIMEOUT => 10,
        CURLOPT_TIMEOUT => 30,
        CURLOPT_AUTOREFERER => true,
        CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.47 Safari/536.11',
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_MAXREDIRS => 10
    );
    // user defined options replace the defaults
    if (! empty ( $this->opt )) {
        foreach ( $this->opt as $name => $value ) {
            $options [$name] = $value;
        }
    }
    $handle = curl_init ();
    curl_setopt_array ( $handle, $options );
    return $handle;
}
}
/**
 * Exception type thrown by CurlMulti for configuration and cache errors.
 */
class CurlMulti_Exception extends Exception
{
}

View File

@ -1,934 +0,0 @@
<?php
/**
*
* @desc HTTP 请求类, 支持 CURL Socket, 默认使用 CURL , 当手动指定
* useCurl 或者 curl 扩展没有安装时, 会使用 Socket
* 目前支持 get post 两种请求方式
*
* @example
*
1. 基本 get 请求:
$http = new Http(); // 实例化对象
$result = $http->get('http://weibo.com/at/comment');
2. 基本 post 请求:
$http = new Http(); // 实例化对象
$result = $http->post('http://someurl.com/post-new-article', array('title'=>$title, 'body'=>$body) );
3. 模拟登录 ( post get 同时使用, 利用 cookie 存储状态 ) :
$http = new Http(); // 实例化对象
$http->setCookiepath(substr(md5($username), 0, 10)); // 设置 cookie, 如果是多个用户请求的话
// 提交 post 数据
$loginData = $http->post('http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.3.19)', array('username'=>$username, 'loginPass'=>$password) );
$result = $http->get('http://weibo.com/at/comment');
4. 利用 initialize 函数设置多个 config 信息
$httpConfig['method'] = 'GET';
$httpConfig['target'] = 'http://www.somedomain.com/index.html';
$httpConfig['referrer'] = 'http://www.somedomain.com';
$httpConfig['user_agent'] = 'My Crawler';
$httpConfig['timeout'] = '30';
$httpConfig['params'] = array('var1' => 'testvalue', 'var2' => 'somevalue');
$http = new Http();
$http->initialize($httpConfig);
$result = $http->result;
5. 复杂的设置:
$http = new Http();
$http->useCurl(false); // 不使用 curl
$http->setMethod('POST'); // 使用 POST method
// 设置 POST 数据
$http->addParam('user_name' , 'yourusername');
$http->addParam('password' , 'yourpassword');
// Referrer
$http->setReferrer('https://yourproject.projectpath.com/login');
// 开始执行请求
$http->execute('https://yourproject.projectpath.com/login/authenticate');
$result = $http->getResult();
6. 获取开启了 basic auth 的请求
$http = new Http();
// Set HTTP basic authentication realms
$http->setAuth('yourusername', 'yourpassword');
// 获取某个被保护的应用的 feed
$http->get('http://www.someblog.com/protected/feed.xml');
$result = $http->result;
*
* @from http://www.phpfour.com/lib/http
* @since Version 0.1
* @original author Md Emran Hasan <phpfour@gmail.com>
* @modify by Charlie Jade
*/
class Http
{
/** 目标请求 @var string */
var $target;
/** 目标 URL 的 host @var string */
var $host;
/** 请求目标的端口 @var integer */
var $port;
/** 请求目标的 path @var string */
var $path;
/** 请求目标的 schema @var string */
var $schema;
/** 请求的 method (GET 或者 POST) @var string */
var $method;
/** 请求的数据 @var array */
var $params;
/** 请求时候的 cookie 数据 @var array */
var $cookies;
/** 请求返回的 cookie 数据 @var array */
var $_cookies;
/** 请求超时时间, 默认是 25 @var integer */
var $timeout;
/** 是否使用 cURL , 默认为 TRUE @var boolean */
var $useCurl;
/** referrer 信息 @var string */
var $referrer;
/** 请求客户端 User agent @var string */
var $userAgent;
/** Contains the cookie path (to be used with cURL) @var string */
var $cookiePath;
/** 是否使用 Cookie @var boolean */
var $useCookie;
/** 是否为下一次请求保存 Cookie @var boolean */
var $saveCookie;
/** HTTP Basic Auth 用户名 (for authentication) @var string */
var $username;
/** HTTP Basic Auth 密码 (for authentication) @var string */
var $password;
/** 请求的结果集 @var string */
var $result;
/** 最后一个请求的 headers 信息 @var array */
var $headers;
/** Contains the last call's http status code @var string */
var $status;
/** 是否跟随 http redirect 跳转 @var boolean */
var $redirect;
/** 最大 http redirect 调整数 @var integer */
var $maxRedirect;
/** 当前请求有多少个 URL @var integer */
var $curRedirect;
/** 错误代码 @var string */
var $error;
/** Store the next token @var string */
var $nextToken;
/** 是否存储 bug 信息 @var boolean */
var $debug;
/** Stores the debug messages @var array @todo will keep debug messages */
var $debugMsg;
/** Constructor for initializing the class with default values. @return void */
public function __construct()
{
// start from the default state
$this->clear();
}
/**
 * Initialize preferences.
 *
 * Resets the object with clear() and then applies an associative array of
 * config values. A key is accepted when it names a property that currently
 * holds a non-null value, either directly ('timeout') or in snake_case
 * ('user_agent' for $userAgent). When a matching setter exists
 * ('set' + key without underscores, e.g. setUseragent) it is called,
 * otherwise the property is assigned directly. Unknown keys are ignored.
 *
 * Example use:
 *
 * <pre>
 * $httpConfig['method'] = 'GET';
 * $httpConfig['target'] = 'http://www.somedomain.com/index.html';
 * $httpConfig['referrer'] = 'http://www.somedomain.com';
 * $httpConfig['user_agent'] = 'My Crawler';
 * $httpConfig['timeout'] = '30';
 * $httpConfig['params'] = array('var1' => 'testvalue', 'var2' => 'somevalue');
 *
 * $http = new Http();
 * $http->initialize($httpConfig);
 * </pre>
 *
 * @param array Config values as associative array
 * @return void
 */
public function initialize($config = array())
{
    $this->clear();
    foreach ($config as $key => $val) {
        $property = $key;
        if (!isset($this->$property)) {
            // Accept snake_case keys such as 'user_agent' by mapping them to
            // the camelCase property name ('userAgent').
            $camel = str_replace(' ', '', ucwords(str_replace('_', ' ', $key)));
            $property = strtolower(substr($camel, 0, 1)) . substr($camel, 1);
            if (!isset($this->$property)) {
                continue;
            }
        }
        $method = 'set' . ucfirst(str_replace('_', '', $key));
        if (method_exists($this, $method)) {
            $this->$method($val);
        } else {
            $this->$property = $val;
        }
    }
}
/**
 * Reset the object to its initial state.
 *
 * Restores the request, config, cookie and user agent properties listed
 * below to their defaults. Handy when making several calls with different
 * data on the same instance.
 *
 * @return void
 */
public function clear()
{
    $defaults = array(
        // request defaults
        'host'        => '',
        'port'        => 0,
        'path'        => '',
        'target'      => '',
        'method'      => 'GET',
        'schema'      => 'http',
        'params'      => array(),
        'headers'     => array(),
        'cookies'     => array(),
        '_cookies'    => array(),
        // config details
        'debug'       => FALSE,
        'error'       => '',
        'status'      => 0,
        'timeout'     => '25',
        'useCurl'     => TRUE,
        'referrer'    => '',
        'username'    => '',
        'password'    => '',
        'redirect'    => TRUE,
        // cookie and agent defaults
        'nextToken'   => '',
        'useCookie'   => FALSE,
        'saveCookie'  => FALSE,
        'maxRedirect' => 3,
        'cookiePath'  => 'cookie.txt',
        'userAgent'   => 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.63 Safari/535.7',
    );
    foreach ($defaults as $property => $value) {
        $this->$property = $value;
    }
}
/** Set the target URL. @param string URL of the target page @return void */
public function setTarget($url)
{
$this->target = $url;
}
/** Set the HTTP request method. @param string HTTP method to use (GET or POST) @return void */
public function setMethod($method)
{
$this->method = $method;
}
/** Set the referrer URL. @param string URL of referrer page @return void */
public function setReferrer($referrer)
{
$this->referrer = $referrer;
}
/** Set the User agent. @param string Full user agent string @return void */
public function setUseragent($agent)
{
$this->userAgent = $agent;
}
/** Set the request timeout. @param integer Timeout delay in seconds @return void */
public function setTimeout($seconds)
{
$this->timeout = $seconds;
}
/** Set the cookie path (cURL only); also switches on using and saving cookies. @param string File location of cookiejar @return void */
public function setCookiepath($path)
{
$this->cookiePath = $path;
$this->useCookie(TRUE);
$this->saveCookie(TRUE);
}
/** Merge request parameters into the ones already set. @param array GET or POST request data @return void */
public function setParams($dataArray)
{
$this->params = array_merge($this->params, $dataArray);
}
/** Set the HTTP basic auth credentials. @param string user name @param string password @return void */
public function setAuth($username, $password)
{
$this->username = $username;
$this->password = $password;
}
/** Set the maximum number of redirects. @param integer Maximum number of redirects @return void */
public function setMaxredirect($value)
{
$this->maxRedirect = $value;
}
/** Add one more request parameter. @param string Name of the parameter @param string Value of the parameter @return void */
public function addParam($name, $value)
{
$this->params[$name] = $value;
}
/** Add a cookie to send with the request. @param string Name of cookie @param string Value of cookie */
public function addCookie($name, $value)
{
$this->cookies[$name] = $value;
}
/** Whether to use cURL (default TRUE); FALSE means use a socket. Non-boolean values are ignored. */
public function useCurl($value = TRUE)
{
if (is_bool($value))
{
$this->useCurl = $value;
}
}
/** Whether to use cookies (default FALSE). @param boolean Whether to use cookies or not @return void */
public function useCookie($value = FALSE)
{
$this->useCookie = $value;
}
/** Whether to save cookies for use by the next request. @param boolean Whether to save persistent cookies or not @return void */
public function saveCookie($value = FALSE)
{
$this->saveCookie = $value;
}
/** Whether to follow HTTP redirects. @param boolean Whether to follow HTTP redirects or not */
public function followRedirects($value = TRUE)
{
$this->redirect = $value;
}
/** Get the result of the last request. @return string output of execution */
public function getResult()
{
return $this->result;
}
/** Get the headers array of the last response. */
public function getHeaders()
{
return $this->headers;
}
/** Get the status code of the last request. */
public function getStatus()
{
return $this->status;
}
/** Get the last error. */
public function getError()
{
return $this->error;
}
/** Execute an HTTP GET request; $data is passed to execute() as request data. */
public function get($url, $data=array()){
return $this->execute($url, '', 'GET', $data);
}
/** Execute an HTTP POST request; $data is passed to execute() as request data. */
public function post($url, $data=array()){
return $this->execute($url, '', 'POST', $data);
}
/**
* 使用当前的配置, 发送一条 HTTP 请求
*
* @param string URL of the target page (optional)
* @param string URL of the referrer page (optional)
* @param string 请求方法 (GET or POST) (optional)
* @param array 请求数据, key value 对应的数组 (optional)
* @return string 请求的结果集
*/
public function execute($target = '', $referrer = '', $method = '', $data = array())
{
// Populate the properties
$this->target = ($target) ? $target : $this->target;
$this->method = ($method) ? $method : $this->method;
$this->referrer = ($referrer) ? $referrer : $this->referrer;
// Add the new params
if (is_array($data) && count($data) > 0)
{
$this->params = array_merge($this->params, $data);
}
// Process data, if presented
if(is_array($this->params) && count($this->params) > 0)
{
// Get a blank slate
$tempString = array();
// Convert data array into a query string (ie animal=dog&sport=baseball)
foreach ($this->params as $key => $value)
{
if(strlen(trim($value))>0)
{
$tempString[] = $key . "=" . urlencode($value);
}
}
$queryString = join('&', $tempString);
}
// 如果 cURL 没有安装就使用 fscokopen 执行请求
$this->useCurl = $this->useCurl && in_array('curl', get_loaded_extensions());
// GET method configuration
if($this->method == 'GET')
{
if(isset($queryString))
{
$this->target = $this->target . "?" . $queryString;
}
}
// Parse target URL
$urlParsed = parse_url($this->target);
// Handle SSL connection request
if ($urlParsed['scheme'] == 'https')
{
$this->host = 'ssl://' . $urlParsed['host'];
$this->port = ($this->port != 0) ? $this->port : 443;
}
else
{
$this->host = $urlParsed['host'];
$this->port = ($this->port != 0) ? $this->port : 80;
}
// Finalize the target path
$this->path = (isset($urlParsed['path']) ? $urlParsed['path'] : '/') . (isset($urlParsed['query']) ? '?' . $urlParsed['query'] : '');
$this->schema = $urlParsed['scheme'];
// Pass the requred cookies
$this->_passCookies();
// Process cookies, if requested
if(is_array($this->cookies) && count($this->cookies) > 0)
{
// Get a blank slate
$tempString = array();
// Convert cookiesa array into a query string (ie animal=dog&sport=baseball)
foreach ($this->cookies as $key => $value)
{
if(strlen(trim($value)) > 0)
{
$tempString[] = $key . "=" . urlencode($value);
}
}
$cookieString = join('&', $tempString);
}
// Do we need to use cURL
if ($this->useCurl)
{
// Initialize PHP cURL handle
$ch = curl_init();
// GET method configuration
if($this->method == 'GET')
{
curl_setopt ($ch, CURLOPT_HTTPGET, TRUE);
curl_setopt ($ch, CURLOPT_POST, FALSE);
}
// POST method configuration
else
{
if(isset($queryString))
{
curl_setopt ($ch, CURLOPT_POSTFIELDS, $queryString);
}
curl_setopt ($ch, CURLOPT_POST, TRUE);
curl_setopt ($ch, CURLOPT_HTTPGET, FALSE);
}
// Basic Authentication configuration
if ($this->username && $this->password)
{
curl_setopt($ch, CURLOPT_USERPWD, $this->username . ':' . $this->password);
}
// Custom cookie configuration
if($this->useCookie && isset($cookieString))
{
curl_setopt ($ch, CURLOPT_COOKIE, $cookieString);
}
curl_setopt($ch, CURLOPT_HEADER, array('Accept-Language: zh-cn','Connection: Keep-Alive','Cache-Control: no-cache'));
curl_setopt($ch, CURLOPT_NOBODY, FALSE); // Return body
curl_setopt($ch, CURLOPT_COOKIEJAR, $this->cookiePath); // cookie 文件
curl_setopt($ch, CURLOPT_COOKIEFILE, $this->cookiePath); // cookie 文件
curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout); // Timeout
curl_setopt($ch, CURLOPT_USERAGENT, $this->userAgent); // Webbot name
curl_setopt($ch, CURLOPT_URL, $this->target); // Target site
curl_setopt($ch, CURLOPT_REFERER, $this->referrer); // Referer value
curl_setopt($ch, CURLOPT_VERBOSE, FALSE); // Minimize logs
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE); // No certificate
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, $this->redirect); // Follow redirects
curl_setopt($ch, CURLOPT_MAXREDIRS, $this->maxRedirect); // Limit redirections to four
curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE); // 是否以 string 格式返回
// Get the target contents
$content = curl_exec($ch);
// Get the request info
$curl_info = curl_getinfo($ch);
$header_size = $curl_info["header_size"];
// 赋值结果集
$this->result = substr($content, $header_size);
$reader = explode("\r\n\r\n", trim(substr($content, 0, $header_size)));
$this->status = $curl_info['http_code'];
// Parse the headers
$this->_parseHeaders( explode("\r\n\r\n", trim(substr($content, 0, $header_size))) );
// Store the error (is any)
$this->_setError(curl_error($ch));
// Close PHP cURL handle
curl_close($ch);
}
else
{
// Get a file pointer
$filePointer = fsockopen($this->host, $this->port, $errorNumber, $errorString, $this->timeout);
// We have an error if pointer is not there
if (!$filePointer)
{
$this->_setError('Failed opening http socket connection: ' . $errorString . ' (' . $errorNumber . ')');
return FALSE;
}
// Set http headers with host, user-agent and content type
$requestHeader = $this->method . " " . $this->path . " HTTP/1.1\r\n";
$requestHeader .= "Host: " . $urlParsed['host'] . "\r\n";
$requestHeader .= "User-Agent: " . $this->userAgent . "\r\n";
$requestHeader .= "Content-Type: application/x-www-form-urlencoded\r\n";
// Specify the custom cookies
if ($this->useCookie && $cookieString != '')
{
$requestHeader.= "Cookie: " . $cookieString . "\r\n";
}
// POST method configuration
if ($this->method == "POST")
{
$requestHeader.= "Content-Length: " . strlen($queryString) . "\r\n";
}
// Specify the referrer
if ($this->referrer != '')
{
$requestHeader.= "Referer: " . $this->referrer . "\r\n";
}
// Specify http authentication (basic)
if ($this->username && $this->password)
{
$requestHeader.= "Authorization: Basic " . base64_encode($this->username . ':' . $this->password) . "\r\n";
}
$requestHeader.= "Connection: close\r\n\r\n";
// POST method configuration
if ($this->method == "POST")
{
$requestHeader .= $queryString;
}
// We're ready to launch
fwrite($filePointer, $requestHeader);
// Clean the slate
$responseHeader = '';
$responseContent = '';
// 3...2...1...Launch !
do
{
$responseHeader .= fread($filePointer, 1);
}
while (!preg_match('/\\r\\n\\r\\n$/', $responseHeader));
// Parse the headers
$this->_parseHeaders($responseHeader);
// Do we have a 301/302 redirect ?
if (($this->status == '301' || $this->status == '302') && $this->redirect == TRUE)
{
if ($this->curRedirect < $this->maxRedirect)
{
// Let's find out the new redirect URL
$newUrlParsed = parse_url($this->headers['location']);
if ($newUrlParsed['host'])
{
$newTarget = $this->headers['location'];
}
else
{
$newTarget = $this->schema . '://' . $this->host . '/' . $this->headers['location'];
}
// Reset some of the properties
$this->port = 0;
$this->status = 0;
$this->params = array();
$this->method = 'GET';
$this->referrer = $this->target;
// Increase the redirect counter
$this->curRedirect++;
// Let's go, go, go !
$this->result = $this->execute($newTarget);
}
else
{
$this->_setError('Too many redirects.');
return FALSE;
}
}
else
{
// Nope...so lets get the rest of the contents (non-chunked)
if ($this->headers['transfer-encoding'] != 'chunked')
{
while (!feof($filePointer))
{
$responseContent .= fgets($filePointer, 128);
}
}
else
{
// Get the contents (chunked)
while ($chunkLength = hexdec(fgets($filePointer)))
{
$responseContentChunk = '';
$readLength = 0;
while ($readLength < $chunkLength)
{
$responseContentChunk .= fread($filePointer, $chunkLength - $readLength);
$readLength = strlen($responseContentChunk);
}
$responseContent .= $responseContentChunk;
fgets($filePointer);
}
}
// Store the target contents
$this->result = chop($responseContent);
}
}
// There it is! We have it!! Return to base !!!
return $this->result;
}
/**
 * Parse raw HTTP response headers into $this->headers.
 *
 * Accepts either one raw header string or an array of raw header blocks;
 * both are flattened into individual header lines first. When $this->status
 * is still 0 the first line must be an HTTP status line and supplies the
 * status code. Header names are lower-cased; a header that occurs more than
 * once is stored as an array of its values. Set-Cookie headers are handed to
 * _parseCookie() when $this->saveCookie is enabled.
 *
 * @param string|array $responseHeader Raw header text, or a list of raw header blocks.
 * @return bool|null FALSE when the status is unknown and no valid status line
 *                   is present; nothing otherwise.
 */
private function _parseHeaders($responseHeader)
{
    // HTTP status line, e.g. "HTTP/1.1 200 OK"; the minor version is optional ("HTTP/2 200").
    // preg_match replaces eregi(), which no longer exists as of PHP 7.0.
    $statusPattern = '/^http\/[0-9]+(?:\.[0-9]+)?[ \t]+([0-9]+)[ \t]*(.*)$/i';
    // Flatten the input (string or list of blocks) into non-empty header lines
    $blocks = is_array($responseHeader) ? $responseHeader : array($responseHeader);
    $headers = array();
    foreach ($blocks as $block)
    {
        foreach (preg_split('/\r\n|\r|\n/', trim((string) $block)) as $line)
        {
            if ($line !== '')
            {
                $headers[] = $line;
            }
        }
    }
    // Clear the header array
    $this->_clearHeaders();
    // Get the response status, unless it is already known
    if ($this->status == 0)
    {
        if (!isset($headers[0]) || !preg_match($statusPattern, $headers[0], $matches))
        {
            $this->_setError('Unexpected HTTP response status');
            return FALSE;
        }
        $this->status = $matches[1];
        array_shift($headers);
    }
    // Prepare all the other headers
    foreach ($headers as $header)
    {
        // A status line (already-known status, or one block per redirect hop) is not a header
        if (preg_match($statusPattern, $header))
        {
            continue;
        }
        // Split on the first colon only, so values such as dates and URLs stay intact
        $parts = explode(':', $header, 2);
        $headerName = strtolower(trim($parts[0]));
        $headerValue = isset($parts[1]) ? trim($parts[1]) : '';
        // If it is already there, collect the values in an array. Otherwise, just keep the string
        if (isset($this->headers[$headerName]))
        {
            if (is_string($this->headers[$headerName]))
            {
                $this->headers[$headerName] = array($this->headers[$headerName]);
            }
            $this->headers[$headerName][] = $headerValue;
        }
        else
        {
            $this->headers[$headerName] = $headerValue;
        }
    }
    // Save cookies if asked
    if ($this->saveCookie && isset($this->headers['set-cookie']))
    {
        $this->_parseCookie();
    }
}
/**
 * Drop every stored response header.
 *
 * Leaves $this->headers as an empty array.
 */
private function _clearHeaders()
{
    // Start from an empty header list
    $this->headers = array();
}
/**
 * Store the cookies announced by the Set-Cookie header(s) in $this->headers.
 *
 * Each header is split on ";": the first segment is "name=value", the rest
 * are attributes. Attribute names are matched case-insensitively ("Path",
 * "Domain", "Expires", "Secure"); "Secure" is a flag whose mere presence
 * enables it. Attributes the server omits default to the host of
 * $this->target and the path "/". Every cookie is passed to _setCookie().
 */
private function _parseCookie()
{
    // A single header is stored as a string, repeated headers as an array
    $cookieHeaders = is_array($this->headers['set-cookie'])
        ? $this->headers['set-cookie']
        : array($this->headers['set-cookie']);
    // The request host is the default cookie domain; it is the same for every cookie
    $urlParsed = parse_url($this->target);
    $defaultDomain = (is_array($urlParsed) && isset($urlParsed['host'])) ? $urlParsed['host'] : '';
    // Loop through the cookies
    foreach ($cookieHeaders as $cookieHeader)
    {
        $segments = explode(';', $cookieHeader);
        // First segment: cookie name and (raw, undecoded) value
        $pair = explode('=', array_shift($segments), 2);
        $cookieName = trim($pair[0]);
        $cookieValue = isset($pair[1]) ? $pair[1] : '';
        $domain = $defaultDomain;
        $secure = '0';
        $path = "/";
        $expires = "";
        // Remaining segments: attributes, with or without a value
        foreach ($segments as $segment)
        {
            $attribute = explode('=', $segment, 2);
            $name = strtolower(trim(urldecode($attribute[0])));
            $value = isset($attribute[1]) ? trim(urldecode($attribute[1])) : '';
            switch ($name)
            {
                case "path" : $path = $value; break;
                case "domain" : $domain = $value; break;
                case "expires" : $expires = $value; break;
                // Valueless flag: being present is what turns it on
                case "secure" : $secure = '1'; break;
            }
        }
        $this->_setCookie($cookieName, $cookieValue, $expires, $path, $domain, $secure);
    }
}
/**
 * Validate a cookie and append it to the $this->_cookies list.
 *
 * The name must be non-empty, the path must start with "/", and the domain
 * must contain a dot somewhere after an optional leading dot. The domain is
 * stored lower-cased without its leading dot; name and value are escaped via
 * _encodeCookie().
 *
 * @param string     $name    Cookie name.
 * @param string     $value   Cookie value.
 * @param string     $expires Expiry text, stored as given.
 * @param string     $path    Cookie path.
 * @param string     $domain  Cookie domain.
 * @param int|string $secure  Secure flag, stored as an integer.
 * @return string|null The error message when validation fails; nothing otherwise.
 */
private function _setCookie($name, $value, $expires = "" , $path = "/" , $domain = "" , $secure = 0)
{
    // Reject a missing name
    if (strlen($name) == 0)
    {
        return $this->_setError("No valid cookie name was specified.");
    }
    // Reject an empty or relative path
    $pathIsAbsolute = (strlen($path) != 0) && ($path[0] === "/");
    if (!$pathIsAbsolute)
    {
        return $this->_setError("$path is not a valid path for setting cookie $name.");
    }
    // Reject a domain without a dot after its (optional) leading dot
    $domainHasDot = false;
    if ($domain != "")
    {
        $searchFrom = ($domain[0] == ".") ? 1 : 0;
        $domainHasDot = (bool) strpos($domain, ".", $searchFrom);
    }
    if (!$domainHasDot)
    {
        return $this->_setError("$domain is not a valid domain for setting cookie $name.");
    }
    // Normalise the domain: lower case, no leading dot
    $domain = strtolower($domain);
    if ($domain[0] === ".")
    {
        $domain = substr($domain, 1);
    }
    // Queue the cookie for the following requests
    $cookie = array(
        "name" => $this->_encodeCookie($name, true),
        "value" => $this->_encodeCookie($value, false),
        "domain" => $domain,
        "path" => $path,
        "expires" => $expires,
        "secure" => intval($secure)
    );
    $this->_cookies[] = $cookie;
}
/**
 * Escape the delimiter that must not appear inside a cookie name or value.
 *
 * @param string $value Text to escape.
 * @param bool   $name  TRUE to treat $value as a cookie name ("=" is escaped),
 *                      FALSE to treat it as a cookie value (";" is escaped).
 * @return string The escaped text.
 */
private function _encodeCookie($value, $name)
{
    if ($name)
    {
        // "=" is byte 0x3D, so its percent-escape is %3D (%25 would be a literal "%")
        return str_replace("=", "%3D", $value);
    }
    return str_replace(";", "%3B", $value);
}
/**
 * Attach the stored cookies that apply to the current target URL.
 *
 * A stored cookie applies when its domain matches the request host (see
 * _domainMatch()), its path is a prefix of the request path, and, if it is
 * flagged secure, the request scheme is https. Matching cookies are handed to
 * addCookie(), grouped by name and ordered by descending path length.
 */
private function _passCookies()
{
    if (!is_array($this->_cookies) || count($this->_cookies) == 0)
    {
        return;
    }
    // parse_url() returns FALSE for a seriously malformed URL and omits absent components
    $urlParsed = parse_url($this->target);
    if (!is_array($urlParsed))
    {
        $urlParsed = array();
    }
    // Stored cookie domains are lower case, so compare against a lower-cased host
    $requestHost = isset($urlParsed['host']) ? strtolower($urlParsed['host']) : '';
    // A URL without a path ("http://example.com") addresses the root
    $requestPath = isset($urlParsed['path']) ? $urlParsed['path'] : '/';
    // parse_url() exposes the protocol under the key "scheme"
    $isSecureRequest = isset($urlParsed['scheme']) && strtolower($urlParsed['scheme']) == 'https';
    $tempCookies = array();
    foreach ($this->_cookies as $cookie)
    {
        if ($this->_domainMatch($requestHost, $cookie['domain'])
            && (0 === strpos($requestPath, $cookie['path']))
            && (empty($cookie['secure']) || $isSecureRequest))
        {
            // Keyed by path length so cookies of the same name can be ordered below
            $tempCookies[$cookie['name']][strlen($cookie['path'])] = $cookie['value'];
        }
    }
    // cookies with longer paths go first
    foreach ($tempCookies as $name => $values)
    {
        krsort($values);
        foreach ($values as $value)
        {
            $this->addCookie($name, $value);
        }
    }
}
/**
 * Decide whether a cookie domain applies to a request host.
 *
 * A domain without a leading dot must equal the host exactly. A domain with a
 * leading dot must contain at least two dots (".example.com", not ".com") and
 * matches the host itself as well as any of its sub-domains.
 *
 * @param string $requestHost  Host name of the request URL.
 * @param string $cookieDomain Domain attribute of the cookie.
 * @return bool TRUE when the cookie may be sent to the host.
 */
private function _domainMatch($requestHost, $cookieDomain)
{
    // Compare as strings with ===, so numeric-looking hosts are never compared as numbers
    $requestHost = (string) $requestHost;
    $cookieDomain = (string) $cookieDomain;
    // Host-only cookie (also covers an empty domain, which has no first character to inspect)
    if ($cookieDomain === '' || $cookieDomain[0] !== '.')
    {
        return $requestHost === $cookieDomain;
    }
    // A lone top-level domain such as ".com" is too broad to accept
    if (substr_count($cookieDomain, '.') < 2)
    {
        return false;
    }
    // Prefix the host with a dot so "example.com" itself matches ".example.com"
    return substr('.' . $requestHost, -strlen($cookieDomain)) === $cookieDomain;
}
/**
 * Cut the next token off a string.
 *
 * Two-argument form, _tokenize($string, $separator): start tokenizing $string.
 * One-argument form, _tokenize($separator): continue with the remainder that
 * the previous call left in $this->nextToken.
 *
 * Every character of $separator is a delimiter; the token ends at the
 * earliest one. The text after that delimiter is kept in $this->nextToken.
 * When no delimiter occurs, the whole string is returned and
 * $this->nextToken becomes empty.
 *
 * @param string $string    Text to split, or the delimiters in the one-argument form.
 * @param string $separator Delimiter characters; empty selects the one-argument form.
 * @return string The token in front of the first delimiter.
 */
private function _tokenize($string, $separator = '')
{
    // One-argument form: the only argument holds the delimiters
    if (strcmp($separator, '') === 0)
    {
        $separator = $string;
        $string = $this->nextToken;
    }
    // Length of the leading run that is free of delimiters, i.e. the offset of the first delimiter
    $cut = strcspn($string, $separator);
    if ($cut < strlen($string))
    {
        $this->nextToken = substr($string, $cut + 1);
        return substr($string, 0, $cut);
    }
    // No delimiter left: the rest of the string is the last token
    $this->nextToken = '';
    return $string;
}
/**
 * Remember an error message.
 *
 * An empty message is ignored, so a previously stored error is kept.
 *
 * @param string $error Error text.
 * @return string|null The message that was stored; nothing when $error is empty.
 */
private function _setError($error)
{
    // Nothing to record
    if ($error == '')
    {
        return;
    }
    $this->error = $error;
    return $error;
}
}
?>

File diff suppressed because it is too large Load Diff