最终更新:看起来目标网站屏蔽了 DO(DigitalOcean)的 IP 地址,这正是我连续几天一直在排查的问题的根源。我另外启动了一个 EC2 实例,让代码在上面正常运行,并加入了缓存等功能,以减轻对该网站的访问压力,同时让我的用户可以分享该网站的链接。
-
更新:我关闭 curl 的 CURLOPT_FAILONERROR 选项后,成功获取到了 HTML;但该网站除了返回 405 错误之外,还要求先设置某些必需的 cookie 才能加载网站内容。
下面是我用于 ajax->PHP 从网站检索 og: meta 的代码。然而,有 1 或 2 个特定的网站会返回错误并无法检索信息。对于大多数网站,该代码都可以无缝运行。
警告:DOMDocument::loadHTML(): Empty string supplied as input in /my/home/path/getUrlMeta.php on line 58(即:传入的输入字符串为空)。
从我的 error_log 中的 curl_error 中:
请求的 URL 返回错误:405 Not Allowed
和
无法连接到 www.something.com 端口 443:拒绝连接
当我在服务器控制台上使用 curl 时,我没有任何问题获取网站的 HTML,并且使用以下代码检索大多数网站所需的信息也没有问题。
/**
 * Fetch the body of a URL with cURL, sending browser-like request headers.
 *
 * HTTP error statuses (4xx/5xx) are not treated as transfer failures: the
 * response body is returned whatever the status code is, because some sites
 * answer with e.g. "405 Not Allowed" and still send the complete HTML.
 *
 * @param string $url Absolute URL to fetch.
 *
 * @return string|false Response body, or false when the transfer itself
 *                      failed (the cURL error message is sent to error_log).
 */
function file_get_contents_curl($url)
{
    $ch = curl_init();

    // Request headers imitating a desktop browser.
    $header = array(
        "Accept: text/html, text/xml,application/xml,application/xhtml+xml,"
            . "text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5",
        "Cache-Control: max-age=0",
        "Connection: keep-alive",
        "Keep-Alive: 300",
        "Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7",
        "Accept-Language: en-us,en;q=0.5",
        "Pragma: no-cache",
    );

    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
    curl_setopt($ch, CURLOPT_HEADER, false);          // body only, no response headers
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);   // return the body instead of printing it
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);
    curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:31.0) Gecko/20100101 Firefox/31.0 ");

    // CURLOPT_FAILONERROR is deliberately NOT set: with it, cURL discards the
    // body of any response whose status is >= 400 and curl_exec() returns
    // false, so pages served with an error status could never be parsed.

    // NOTE(review): certificate and host-name verification are switched off,
    // which leaves HTTPS fetches open to interception. Kept as in the original
    // behaviour; enable both once the server has a working CA bundle.
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 0);
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);

    // Cookie handling: start a fresh cookie session on each call, and use the
    // same file both to read stored cookies and to save cookies received.
    $cookiefile = '/home/my/folder/cookies.txt';
    curl_setopt($ch, CURLOPT_COOKIESESSION, true);
    curl_setopt($ch, CURLOPT_COOKIEJAR, $cookiefile);
    curl_setopt($ch, CURLOPT_COOKIEFILE, $cookiefile);

    $data = curl_exec($ch);

    // Log transport-level failures (DNS, connection refused, timeout, ...).
    $error = curl_error($ch);
    if ($error !== '') {
        error_log($error);
    }

    curl_close($ch);

    return $data;
}
// Fetch the page whose Open Graph metadata is requested.
// NOTE(review): $url is not defined in the visible code; presumably it comes
// from the AJAX request. Confirm it is validated before it reaches this point.
$html = file_get_contents_curl($url);

// Collected Open Graph values, keyed by property name without the "og:" prefix.
$rmetas = array();

// The fetch returns false when the transfer failed, and a response body can be
// empty. DOMDocument::loadHTML() rejects empty input, so parsing is skipped in
// both cases and an empty result is emitted.
if ($html !== false && $html !== '') {
    // Collect libxml parse errors internally instead of raising a PHP warning
    // for each piece of malformed markup in the fetched page.
    libxml_use_internal_errors(true);

    $doc = new DOMDocument();
    $doc->loadHTML($html);
    $xpath = new DOMXPath($doc);

    // Every <meta property="og:..."> element; the 3-character "og:" prefix is
    // stripped from the key. A later duplicate property overwrites an earlier one.
    foreach ($xpath->query('//*/meta[starts-with(@property, \'og:\')]') as $meta) {
        $property = substr($meta->getAttribute('property'), 3);
        $rmetas[$property] = $meta->getAttribute('content');
    }

    // Fallback when og:image is missing or empty: take the FIRST <img> in
    // document order that is wider than 600px.
    if (empty($rmetas['image'])) {
        foreach ($xpath->query('//*/img') as $img) {
            $src = $img->getAttribute('src');

            // Only absolute http(s) URLs whose last three characters look like
            // a jpg/png/jpeg extension are probed.
            $isAbsolute = substr($src, 0, 4) === 'http';
            $hasImageSuffix = in_array(substr($src, -3), array('jpg', 'png', 'peg'), true);
            if (!$isAbsolute || !$hasImageSuffix) {
                continue;
            }

            // getimagesize() returns false when the image cannot be read.
            $size = getimagesize($src);
            if ($size === false) {
                continue;
            }

            // $size[0] is the width in pixels. The break leaves the foreach,
            // so no further images are probed once a match is found.
            if ($size[0] > 600) {
                $rmetas['image'] = $src;
                break;
            }
        }
    }
}

echo json_encode($rmetas);
die();
更新:是我弄错了,该网站并未启用 HTTPS(所以前面 443 端口的连接才会被拒绝);不过改用 HTTP 之后,我仍然遇到 405 Not Allowed 错误。
curl信息
{
"url": "http://www.example.com/",
"content_type": null,
"http_code": 405,
"header_size": 0,
"request_size": 458,
"filetime": -1,
"ssl_verify_result": 0,
"redirect_count": 0,
"total_time": 0.326782,
"namelookup_time": 0.004364,
"connect_time": 0.007725,
"pretransfer_time": 0.007867,
"size_upload": 0,
"size_download": 0,
"speed_download": 0,
"speed_upload": 0,
"download_content_length": -1,
"upload_content_length": -1,
"starttransfer_time": 0.326634,
"redirect_time": 0,
"redirect_url": "",
"primary_ip": "SOME IP",
"certinfo": [],
"primary_port": 80,
"local_ip": "SOME IP",
"local_port": 52966
}
更新:如果我在控制台执行 curl -i,会得到以下响应:状态码是错误的 405,但后面跟着我需要的全部 HTML。
Home> curl -i http://www.domain.com
HTTP/1.1 405 Not Allowed
Server: nginx
Date: Wed, 22 Feb 2017 17:57:03 GMT
Content-Type: text/html; charset=UTF-8
Transfer-Encoding: chunked
Vary: Accept-Encoding
Vary: Accept-Encoding
Set-Cookie: PHPSESSID2=ko67tfga36gpvrkk0rtqga4g94; path=/; domain=.domain.com
Expires: Thu, 19 Nov 1981 08:52:00 GMT
Cache-Control: no-store, no-cache, must-revalidate, post-check=0, pre-check=0
Pragma: no-cache
Set-Cookie: __PAGE_REFERRER=deleted; expires=Thu, 01-Jan-1970 00:00:01 GMT; Max-Age=0; path=/; domain=www.domain.com
Set-Cookie: __PAGE_SITE_REFERRER=deleted; expires=Thu, 01-Jan-1970 00:00:01 GMT; Max-Age=0; path=/; domain=www.domain.com
X-Repository: legacy
X-App-Server: production-web23:8018
X-App-Server: distil2-kvm:80
去掉 CURLOPT_FAILONERROR 选项后,你就会得到完整的 405 响应内容,就像你展示的等效命令行调用一样。 - Daniel Stenberg