<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:atom="http://www.w3.org/2005/Atom"
	xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
	xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
	>

<channel>
	<!-- Channel identity: display title, the feed's own URL (atom:link rel="self"),
	     the site link and a short description. -->
	<title>ls /proc &#187; spider</title>
	<atom:link
		rel="self"
		type="application/rss+xml"
		href="http://www.lsproc.com/blog/tag/spider/feed/"/>
	<link>http://www.lsproc.com/blog</link>
	<description>lsproc.com</description>

	<!-- Build timestamp and declared language of the channel. -->
	<lastBuildDate>Fri, 18 Nov 2011 09:22:52 +0000</lastBuildDate>
	<language>en</language>

	<!-- Syndication module hints: period "hourly" with frequency 1. -->
	<sy:updatePeriod>hourly</sy:updatePeriod>
	<sy:updateFrequency>1</sy:updateFrequency>

	<!-- Generator is recorded as a URL carrying the version in its query string. -->
	<generator>http://wordpress.org/?v=3.3.1</generator>
		<!-- Single feed entry: title, link to the post, link to its comments anchor,
		     publication date and author (dc:creator). -->
		<item>
		<title>python 抓取页面</title>
		<link>http://www.lsproc.com/blog/python_spider/</link>
		<comments>http://www.lsproc.com/blog/python_spider/#comments</comments>
		<pubDate>Mon, 01 Feb 2010 06:34:31 +0000</pubDate>
		<dc:creator>lostsnow</dc:creator>
		<!-- Category labels are wrapped in CDATA, which is why the raw ampersand in
		     the first label is legal without being escaped. -->
				<category><![CDATA[Program&Database]]></category>
		<category><![CDATA[python]]></category>
		<category><![CDATA[spider]]></category>

		<!-- isPermaLink="false": the guid is an identifier (the ?p=363 URL), not the
		     post's public link given in <link> above. -->
		<guid isPermaLink="false">http://www.lsproc.com/blog/?p=363</guid>
		<!-- Short excerpt ending in a "Continue reading" link. Inside CDATA the XML
		     parser does not expand character references, so sequences such as &#34;
		     reach the consumer as literal text. -->
		<description><![CDATA[转载时请标明文章原始出处和作者信息, 作者: lostsnow.http://www.lsproc.com/blog/python_spider/ #coding=utf-8 import sys import urllib2 import gzip import StringIO # 页面url url = &#34;http://china.toocle.com/company/show/pdetail--1000436--10532651.html&#34; # 页面编码 page_encode = &#34;gbk&#34; request = urllib2.Request(url) request.add_header(&#34;Accept-encoding&#34;, &#34;gzip&#34;) usock = urllib2.urlopen(request) page = usock.read() # 处理gzip过的页面 if usock.headers.get(&#039;content-encoding&#039;, None) == &#039;gzip&#039;: &#8230; <a href="http://www.lsproc.com/blog/python_spider/">Continue reading <span class="meta-nav">&#8594;</span></a>]]></description>
		<!-- Full post body as HTML in one CDATA section that spans many lines. The
		     line breaks and leading whitespace inside the <pre> listing are part of
		     the data, so this section must not be re-indented or reflowed. -->
			<content:encoded><![CDATA[<p>转载时请标明文章原始出处和作者信息, 作者: <a href="http://www.lsproc.com/blog/">lostsnow</a>.<br /><a href="http://www.lsproc.com/blog/python_spider/">http://www.lsproc.com/blog/python_spider/</a></p>
<pre class="brush: python">
#coding=utf-8

import sys
import urllib2
import gzip
import StringIO

# 页面url
url = &quot;http://china.toocle.com/company/show/pdetail--1000436--10532651.html&quot;
# 页面编码
page_encode = &quot;gbk&quot;

request = urllib2.Request(url)
request.add_header(&quot;Accept-encoding&quot;, &quot;gzip&quot;)
usock = urllib2.urlopen(request)
page = usock.read()
# 处理gzip过的页面
if usock.headers.get(&#039;content-encoding&#039;, None) == &#039;gzip&#039;:
    page = gzip.GzipFile(fileobj=StringIO.StringIO(page)).read()

# 转unicode(gbk/utf8)
if not isinstance(page, unicode):
    page = unicode(page, page_encode)

print(page)
</pre>
<p>-- EOF --</p>
<h2  class="related_post_title">Related Posts</h2><ul class="related_post"><li>2008-11-10 -- <a href="http://www.lsproc.com/blog/python_pil/" title="python图形处理库Python Imaging Library (PIL)">python图形处理库Python Imaging Library (PIL)</a> (0)</li><li>2008-11-05 -- <a href="http://www.lsproc.com/blog/configure_python_and_django_on_dreamhost/" title="Dreamhost 上编译python并安装django">Dreamhost 上编译python并安装django</a> (7)</li></ul>]]></content:encoded>
		<!-- Per-post comment feed URL, followed by the slash:comments value (0 here). -->
			<wfw:commentRss>http://www.lsproc.com/blog/python_spider/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
	</channel>
</rss>

