import urllib.request
import urllib.parse
from bs4 import BeautifulSoup
import json
class ZhiLianSpider(object):
    """Scrape job listings from Zhaopin search result pages into a JSON file.

    For each page in ``[start_page, end_page]`` the spider requests the
    search URL, extracts one record per result table and, once all pages are
    processed, writes every collected record to ``zhilian.txt`` as a JSON
    array.
    """

    # Base search URL; the urlencoded query string is appended directly.
    url = "https://sou.zhaopin.com/?"

    # (output key, CSS selector relative to one result table), in the order
    # the keys appear in each saved record.
    _FIELD_SELECTORS = (
        ('职位名称', '.zwmc > div > a'),
        ('公司名称', '.gsmc > a'),
        ('职位月薪', '.zwyx'),
        ('工作地点', '.gzdd'),
        ('更新时间', '.gxsj > span'),
    )

    def __init__(self, jl, kw, start_page, end_page):
        """Store the search parameters.

        Args:
            jl: Value sent as the ``jl`` query parameter (the job location
                entered by the user).
            kw: Value sent as the ``kw`` query parameter (the search keyword).
            start_page: First page number to fetch (inclusive).
            end_page: Last page number to fetch (inclusive).
        """
        self.jl = jl
        self.kw = kw
        self.start_page = start_page
        self.end_page = end_page
        self.items = []  # Accumulates one dict per job listing across pages.

    def parse_content(self, content):
        """Parse one result page and append its listings to ``self.items``.

        Args:
            content: HTML text of a search result page.

        Result tables that lack any of the expected cells are skipped, so a
        single malformed row cannot abort the whole crawl.
        """
        soup = BeautifulSoup(content, 'html.parser')
        # The first table under #listContent is skipped (presumably the
        # header row of the result list -- confirm against the live markup).
        for table in soup.select('#listContent > table')[1:]:
            item = {}
            for key, selector in self._FIELD_SELECTORS:
                node = table.select_one(selector)
                if node is None:
                    break  # Incomplete row: discard it.
                item[key] = node.text
            else:
                # Only reached when every field was found.
                self.items.append(item)

    def run(self):
        """Fetch every requested page, parse it, and save all results.

        Writes the collected items to ``zhilian.txt`` (UTF-8) as a JSON
        array. Network errors raised by ``urllib`` propagate to the caller.
        """
        for page in range(self.start_page, self.end_page + 1):
            request = self.handler_request(page)
            # The context manager closes the HTTP response after reading.
            with urllib.request.urlopen(request) as response:
                # Honour the charset declared by the server; fall back to
                # UTF-8 when none is declared.
                charset = response.headers.get_content_charset() or "utf-8"
                content = response.read().decode(charset)
            self.parse_content(content)

        # ensure_ascii=False keeps Chinese text readable in the output file
        # instead of emitting \uXXXX escapes.
        with open("zhilian.txt", "w", encoding="utf-8") as f:
            json.dump(self.items, f, ensure_ascii=False)

    def handler_request(self, page):
        """Build the GET request for one result page.

        Args:
            page: Page number sent as the ``p`` query parameter.

        Returns:
            A ``urllib.request.Request`` for the search URL with a browser
            user-agent header.
        """
        query = {
            'jl': self.jl,
            'kw': self.kw,
            'p': page,
        }
        # The parameters may contain non-ASCII text, so they must be
        # percent-encoded before being appended to the URL.
        get_url = ZhiLianSpider.url + urllib.parse.urlencode(query)
        headers = {
            # Adjacent literals are concatenated at compile time; unlike a
            # backslash continuation inside the string, this cannot leak
            # source indentation into the header value.
            "user-agent": (
                "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/75.0.3770.100 Safari/537.36"
            ),
        }
        return urllib.request.Request(url=get_url, headers=headers)
def main():
    """Prompt for the search parameters on stdin and run the spider."""
    location = input("请输入工作地点:")
    keyword = input("请输入工作关键词:")
    first_page = int(input("请输入查询起始页面:"))
    last_page = int(input("查询结束页面:"))
    # Build the spider from the answers and start crawling immediately.
    ZhiLianSpider(
        jl=location,
        kw=keyword,
        start_page=first_page,
        end_page=last_page,
    ).run()
# Run the interactive crawler only when executed as a script, not on import.
if __name__ == "__main__":
    main()