关于网页抓取,Beautiful Soup是个好东西,不过现在还没有试过。
#获得所有链接
from HTMLParser import HTMLParser
from urllib import urlopen
class MyHTMLParser(HTMLParser):
    """HTML parser that collects link and image URLs into ``self.links``.

    After ``feed()``-ing it HTML, ``self.links`` holds every ``href``
    value from ``<a>`` tags and every ``src`` value from ``<img>`` tags,
    in document order.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # URLs harvested so far, in the order they appear in the document.
        self.links = []

    def handle_starttag(self, tag, attrs):
        # BUG FIX: the original inspected only <img> tags, so its "href"
        # branch was dead code (<img> carries src, not href).  To actually
        # gather all links we must look at <a> tags too.
        if tag not in ("a", "img"):
            return
        for name, value in attrs:
            if name in ("href", "src"):
                self.links.append(value)
if __name__ == "__main__":
    # Fetch the page, run the parser over it, and print every URL found.
    # BUG FIX: the original never closed the urlopen() response object,
    # leaking the underlying socket; close it as soon as the body is read.
    page = urlopen('http://www.baidu.com')
    try:
        html_code = page.read()
    finally:
        page.close()
    hp = MyHTMLParser()
    hp.feed(html_code)
    hp.close()
    print(hp.links)
上面这段代码直接取的是某个标记的属性,但如果你连标记之间的文本也想取到,就可以用到HTMLParser类里面的handle_data。不过handle_data取到的文本是不加区分的,所以需要一个标识来判断什么文本该取、什么文本不该取,类似这段代码:
from urllib import urlopen
from HTMLParser import HTMLParser
class Scraper(HTMLParser):
    """Print ``text(url)`` for every link that sits inside an <h2> heading.

    ``handle_data`` fires for arbitrary text, so two flags gate it:
    ``in_h3`` marks that we are inside an <h2>, ``in_link`` that we are
    inside an <a href=...>; only when both held at </a> is the link printed.
    """

    # NOTE(review): the flag is named in_h3 but actually tracks <h2> tags;
    # kept unchanged here since it is externally visible class state.
    in_h3 = False
    in_link = False

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if tag == 'h2':
            self.in_h3 = True
        if tag == 'a' and 'href' in attrs:
            self.in_link = True
            # Text fragments of the current link; handle_data may be
            # called several times per element, so collect then join.
            self.chunks = []
            self.url = attrs['href']

    def handle_data(self, data):
        # Only text inside an <a ...> element is interesting.
        if self.in_link:
            self.chunks.append(data)

    def handle_endtag(self, tag):
        if tag == 'h2':
            self.in_h3 = False
        if tag == 'a':
            if self.in_h3 and self.in_link:
                # BUG FIX: the original used the Python 2 print *statement*,
                # a syntax error on Python 3; the parenthesized call below
                # behaves identically on both Python 2 and 3.
                print('%s(%s)' % (''.join(self.chunks), self.url))
            self.in_link = False
# Fetch the jobs page and print all links found inside <h2> headings.
# BUG FIX: the original never closed the urlopen() response object,
# leaking the underlying socket; close it as soon as the body is read.
page = urlopen('http://www.python.org/community/jobs')
try:
    text = page.read()
finally:
    page.close()
parser = Scraper()
parser.feed(text)
parser.close()
好吧,在这里再记几个blog
http://blog.csdn.net/bestdowt1314/archive/2011/01/13/6134960.aspx
http://spark.wikidot.com/blog:python
http://www.ibm.com/developerworks/cn/linux/l-python-mechanize-beautiful-soup/
给了提示我,谢谢!