Getting Started with Crawlers------BeautifulSoup 4.0
The example used throughout is scraping WeChat official account articles via Sogou search.
Let's start with a sample:
import re
import sys
import os

from bs4 import BeautifulSoup


class parseHtml:
    def __init__(self, param):
        if not os.path.isfile(param):
            print "[ERR]Not Found File " + param
            sys.exit(1)
        self.filename = param
        self.filestr = open(self.filename, 'r')
        # Name a parser explicitly so bs4 does not have to guess one.
        self.filesoup = BeautifulSoup(self.filestr, 'html.parser')

    def getRecordTitleAndName(self):
        # Article page: title in the first <h2>, account name in #post-user.
        titlename = self.filesoup.h2.get_text().strip()
        webchat_name = self.filesoup.find(id='post-user').get_text()
        return [titlename, webchat_name]

    def getWechatNumberAndName(self):
        # Profile page: the account's nickname and its WeChat id.
        name = self.filesoup.find("strong", {"class": "profile_nickname"}).string.strip()
        number = self.filesoup.find("p", {"class": "profile_account"}).get_text().split()
        return [name, number[1]]

    def getRecordList(self):
        # The article list is embedded as a JSON object inside a <script> tag.
        for x in self.filesoup.find_all('script'):
            item = x.get_text()
            match = re.search('{.*list.*}', item)
            if match is not None:
                data_json_str = match.group(0)
                # Undo the HTML entity escaping inside the embedded JSON.
                data_json_str = data_json_str.replace('&nbsp;', ' ').replace('&amp;', '&')
                return data_json_str.encode("utf-8")
        return False

    def getRecordContent(self):
        # This method was called below but missing from the original listing;
        # a minimal sketch, assuming the article body sits in
        # <div id="js_content"> as on WeChat article pages.
        content = self.filesoup.find(id='js_content')
        if content is None:
            return False
        return content.get_text().strip()

    def getNumberSourceUrl(self):
        # Search result page: homepage link of the first matched account.
        account = self.filesoup.find(uigs='account_name_0')
        if account is None:
            return False
        return account.get('href')


if len(sys.argv) < 3:
    print "params is illegal. usage: [request_type] [request_param]"
    sys.exit(1)

request_type = sys.argv[1]
request_param = sys.argv[2]

if request_type == '1':
    # Step 1: pull the account's homepage URL out of the Sogou result page.
    request = parseHtml(request_param)
    url = request.getNumberSourceUrl()
    if url is False:
        print "[ERR][step1]file is not available " + request_param
        sys.exit(1)
    print url.encode("utf-8")
    sys.exit(0)
elif request_type == '2':
    # Step 2: pull the account info and the article list JSON.
    request = parseHtml(request_param)
    record_list = request.getRecordList()
    numberName = request.getWechatNumberAndName()
    if record_list is False:
        print "[ERR][step2]file is not available " + request_param
        sys.exit(1)
    print numberName[0].encode("utf-8")
    print numberName[1].encode("utf-8")
    print record_list
    sys.exit(0)
elif request_type == '3':
    # Step 3: pull the title and body of a single article page.
    request = parseHtml(request_param)
    titleAndName = request.getRecordTitleAndName()
    record_content = request.getRecordContent()
    if record_content is False:
        print "[ERR][step3]file is not available " + request_param
        sys.exit(1)
    print titleAndName[0].encode("utf-8")
    print titleAndName[1].encode("utf-8")
    print record_content.encode("utf-8")
    sys.exit(0)
The code does its DOM parsing against pages that have already been downloaded to local files.
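For instance, once a result page has been saved locally, the class can also be driven directly from Python instead of through the command-line dispatch; a small sketch, where the file name sogou_result.html is just an assumed example:

# 'sogou_result.html' is an assumed, previously downloaded Sogou result page.
page = parseHtml('sogou_result.html')
print page.getNumberSourceUrl()  # homepage URL of the first matched account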
For fetching those pages from a URL in the first place, urllib2 works very well.
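A minimal fetch-and-save sketch; the search URL and output filename here are assumptions of my own, and the User-Agent header is set because Sogou tends to reject requests that do not look like they come from a browser:

import urllib2

# Both the URL and the output filename below are assumed examples.
url = 'http://weixin.sogou.com/weixin?type=1&query=example'
req = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
html = urllib2.urlopen(req, timeout=10).read()
with open('sogou_result.html', 'w') as f:
    f.write(html)

The saved file can then be passed to the script as request_param for step 1.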