Home | 简体中文 | 繁体中文 | 杂文 | Search | ITEYE 博客 | OSChina 博客 | Facebook | Linkedin | 作品与服务 | Email

3.15. XML

3.15.1. DOM

3.15.1.1. loadHTML

		
// Download the page; curl() hands back false when the transfer fails.
$html = curl($url);
if ($html === false || $html === '') {
    // loadHTML() cannot parse an empty document, so stop before calling it.
    throw new RuntimeException("Unable to fetch {$url}");
}

// Real-world HTML is rarely valid for libxml; buffer the parser errors
// instead of letting them surface as PHP warnings.
libxml_use_internal_errors(true);

$dom = new DOMDocument();
$dom->loadHTML($html);
// Discard the buffered parser errors so they do not accumulate.
libxml_clear_errors();
		
		

3.15.1.2. XPath

3.15.1.2.1. evaluate
			
<?php
// Load a remote page and read its <title> through an XPath expression.

// Buffer libxml parser errors instead of emitting a warning for every
// piece of markup the parser does not accept.
libxml_use_internal_errors(true);

$doc = new DOMDocument();
if (!$doc->loadHTMLFile('http://netkiller.github.io/')) {
    // Without a loaded document the XPath lookup below would be meaningless.
    throw new RuntimeException('Unable to load http://netkiller.github.io/');
}
// Discard the buffered parser errors so they do not accumulate.
libxml_clear_errors();

$xpath = new DOMXPath($doc);
// Wrapping the path in string() makes evaluate() return a plain string
// rather than a node list.
$title = $xpath->evaluate('string(/html/head/title)');

echo "Document title is: " . $title . "\n";
?>
			
			
3.15.1.2.2. query
			

 <?php 
 /**
  * Fetch a URL with cURL and return the response body.
  *
  * A plain request is sent unless $fields is non-empty, in which case the
  * request becomes a POST whose body is $fields encoded by http_build_query().
  *
  * @param string $url    Address to request; it is also sent as the Referer.
  * @param array  $fields Form fields to POST; leave empty for a plain request.
  * @param mixed  $auth   Currently unused; kept so existing callers keep working.
  * @return string|false  The response body, or false when the transfer fails.
  */
 function curl($url, $fields = array(), $auth = false){
     $curl = curl_init($url);

     curl_setopt($curl, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.1) Gecko/20061204 Firefox/2.0.0.1");
     // Return the body from curl_exec() instead of printing it.
     curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
     curl_setopt($curl, CURLOPT_VERBOSE, 0);
     // Keep the response headers out of the returned body.
     curl_setopt($curl, CURLOPT_HEADER, 0);
     curl_setopt($curl, CURLOPT_REFERER, $url);
     // NOTE(review): certificate verification is switched off, which leaves
     // HTTPS requests open to interception. Kept as-is to preserve existing
     // behaviour; enable verification before using this outside of examples.
     curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);

     if($fields){
         curl_setopt($curl, CURLOPT_POST, true);
         curl_setopt($curl, CURLOPT_POSTFIELDS, http_build_query($fields));
     }

     $response = curl_exec($curl);
     curl_close($curl);

     return $response;
 }
// Download a page and dump every <div class="section"> element found in it.
$url = "http://netkiller.github.io/journal/index.html";
$html = curl($url);
if ($html === false || $html === '') {
    // loadHTML() cannot parse an empty document, so stop before calling it.
    throw new RuntimeException("Unable to fetch {$url}");
}

// Buffer libxml parser errors instead of letting them surface as warnings.
libxml_use_internal_errors(true);

$dom = new DOMDocument();
$dom->loadHTML($html);
// Discard the buffered parser errors so they do not accumulate.
libxml_clear_errors();

$xpath = new DOMXPath($dom);
$sections = $xpath->query('//div[@class="section"]');

foreach ($sections as $section) {
    // To print only the text of the first child instead of the whole node,
    // use $section->childNodes->item(0)->nodeValue.
    print_r($section);
}
			
			

3.15.1.3. saveHTML

下面的例子演示如何从某个网站的页面中提取一个 HTML 块。

		

// Download a page and print the markup of its first <div class="section">.
$url = "http://netkiller.github.io/journal/index.html";
$html = curl($url);
if ($html === false || $html === '') {
    // loadHTML() cannot parse an empty document, so stop before calling it.
    throw new RuntimeException("Unable to fetch {$url}");
}

// Buffer libxml parser errors instead of letting them surface as warnings.
libxml_use_internal_errors(true);

$dom = new DOMDocument();
$dom->loadHTML($html);
// Discard the buffered parser errors so they do not accumulate.
libxml_clear_errors();

$xpath = new DOMXPath($dom);
$sections = $xpath->query('//div[@class="section"]');

$firstSection = $sections->item(0);
if ($firstSection === null) {
    // Passing null to saveHTML() would serialise the whole document,
    // which is not the single block this example is meant to extract.
    throw new RuntimeException("No div with class \"section\" found in {$url}");
}

// Serialise only the matched element rather than the entire document.
$xhtml = $dom->saveHTML($firstSection);

print_r($xhtml);
		
		
comments powered by Disqus