1 import requests
2 from requests.exceptions import RequestException
3 from pyquery import PyQuery as pq
4 from bs4 import BeautifulSoup
5 import pymongo
6 from config import *
7 from multiprocessing import Pool
8 import time
9
# Module-level MongoDB handles shared by the functions below.
# MONGO_URL and MONGO_DB are not defined in this file; presumably they are
# provided by `from config import *` -- verify against config.
# NOTE(review): the client is created at import time, i.e. before the worker
# Pool is started in the __main__ block -- confirm this matches PyMongo's
# guidance on using MongoClient together with multiprocessing.
client = pymongo.MongoClient(MONGO_URL) # Create the MongoDB connection object
db = client[MONGO_DB] # Select the database
12
def get_one_page_html(url, timeout=10):
    """Download one page and return its HTML text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection could block a worker forever.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status or any ``RequestException``,
        which includes timeouts).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/85.0.4183.121 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        # Network failure, timeout, invalid URL, ...: report "no page".
        return None
    if response.status_code == 200:
        return response.text
    return None
26
27
def get_room_url(html):
    """Extract the room-detail links and location paragraphs of a listing page.

    Args:
        html: Markup of one listing page. May be ``None`` or empty, which is
            what ``get_one_page_html`` hands back when the download failed.

    Returns:
        A ``(room_urls, addresses)`` pair:
        * ``room_urls`` -- iterator of PyQuery objects, one per ``<a>``
          matched by ``.content__list--item--main .content__list--item--title a``;
        * ``addresses`` -- list of the ``<p class="content__list--item--des">``
          tags found by BeautifulSoup.
        Both are empty when ``html`` is falsy.
    """
    if not html:
        # Nothing to parse: return empty results instead of letting the
        # parsers raise on None.
        return iter(()), []

    soup = BeautifulSoup(html, 'lxml')
    addresses = soup.find_all('p', {'class': 'content__list--item--des'})
    doc = pq(html)
    room_urls = doc('.content__list--item--main .content__list--item--title a').items()
    return room_urls, addresses
34
35
def parser_room_page(room_html, address_queue1, address_queue2, address_queue3):
    """Parse one rental detail page and yield a single record for it.

    This is a generator: nothing runs until it is iterated. It yields one
    dict on success and nothing when the page does not have the expected
    layout.

    Args:
        room_html: Markup of the rental detail page.
        address_queue1: Sequence whose last element (taken with ``pop()``)
            becomes ``location1`` of the record.
        address_queue2: Same, for ``location2``.
        address_queue3: Same, for ``location3``.

    Yields:
        A dict with the keys ``pinpai``, ``zujin``, ``yajin``, ``fuwufei``,
        ``zhongjiefei``, ``house_type``, ``area``, ``floor``, ``direction``,
        ``elevator``, ``carport``, ``tenancy``, ``maintenance``, ``kanfang``,
        ``location1``, ``location2``, ``location3`` and ``tags``.

    Raises:
        IndexError: If any of the address queues is empty.
    """
    # Take this listing's location first, so each queue advances exactly once
    # per call even when the page below turns out to be unparsable.
    location1 = address_queue1.pop()
    location2 = address_queue2.pop()
    location3 = address_queue3.pop()

    soup = BeautifulSoup(room_html, 'lxml')
    try:
        # First space-separated token of the aside subtitle.
        pinpai = soup.find('p', {'class': 'content__aside__list--subtitle oneline'}).text.strip().split(' ')[0]

        # The fee table is addressed by fixed cell positions.
        price = soup.find_all('li', {'class': 'table_col'})
        zujin = price[6].text  # rent
        yajin = price[7].text  # deposit
        fuwufei = price[8].text  # service fee
        zhongjiefei = price[9].text  # agency fee

        house_type = soup.find('ul', {'class': 'content__aside__list'}).find_all('li')[1].text[5:11]  # layout

        # Basic-info items, also addressed by position; [3:] drops the first
        # three characters of each item (presumably its label -- TODO confirm).
        x = soup.find_all('li', {'class': 'fl oneline'})
        area = x[1].text[3:]  # floor area
        floor = x[7].text[3:]  # storey
        direction = x[2].text[3:]  # orientation
        elevator = x[8].text[3:]  # whether there is an elevator
        carport = x[10].text[3:]  # whether there is a parking space
        tenancy = x[18].text[3:]  # lease term
        maintenance = x[4].text[3:]  # maintenance date
        kanfang = x[21].text[3:]  # whether a viewing needs an appointment

        tags = soup.find('p', {'class': 'content__aside--tags'}).get_text().replace('\n', '')  # tags
    except (AttributeError, IndexError) as exc:
        # A missing element (find() returned None) or a shorter list than the
        # fixed indexes expect: skip this page rather than abort the crawl.
        print('skipping room page with unexpected layout:', repr(exc))
        return

    yield {
        'pinpai': pinpai,
        'zujin': zujin,
        'yajin': yajin,
        'fuwufei': fuwufei,
        'zhongjiefei': zhongjiefei,
        'house_type': house_type,
        'area': area,
        'floor': floor,
        'direction': direction,
        'elevator': elevator,
        'carport': carport,
        'tenancy': tenancy,
        'maintenance': maintenance,
        'kanfang': kanfang,
        'location1': location1,
        'location2': location2,
        'location3': location3,
        'tags': tags,
    }
76
77
def save_to_mongo(result):
    """Insert one record into the ``MONGO_TABLE`` collection.

    ``insert_one`` reports failure by raising, not through its return value
    (the returned result object is always truthy), so the error is caught
    here to make the ``False`` return actually reachable.

    Args:
        result: The document (dict) to store.

    Returns:
        ``True`` when the document was inserted, ``False`` when PyMongo
        raised an error.
    """
    try:
        db[MONGO_TABLE].insert_one(result)
    except pymongo.errors.PyMongoError as exc:
        print('存储到mongodb失败', exc)
        return False
    print('存储到mongodb成功', result)
    return True
83
84
def main(page):
    """Crawl one listing page and store every room found on it.

    Downloads listing page number ``page``, then fetches, parses and saves
    each room detail page linked from it.

    Args:
        page: Listing page number, inserted into the listing URL.

    Returns:
        None.
    """
    url = 'http://sz.xxxxx.com/zufang/pg' + str(page) + 'rt200600000002/#contentList'
    html = get_one_page_html(url)
    if html is None:
        # The listing page could not be downloaded; nothing to crawl.
        return

    room_urls, addresses = get_room_url(html)

    # Pair each room link with its own address block by position (the same
    # ordering assumption the original queues relied on). Pairing up front
    # keeps a failed detail download from shifting later rooms onto the
    # wrong address.
    for room_url, address in zip(room_urls, addresses):
        links = address.find_all('a')
        room_url_href = 'http://sz.xxxxx.com/' + room_url.attr('href')
        room_html = get_one_page_html(room_url_href)
        if room_html is None:
            # Important: the detail page could not be fetched, so there is
            # nothing to parse.
            continue
        # parser_room_page pops one entry from each queue, so pass it
        # single-item queues holding just this room's location parts
        # (district, sub-area, residential compound -- per the original
        # comments).
        results = parser_room_page(
            room_html,
            [links[0].text],
            [links[1].text],
            [links[2].text],
        )
        for result in results:
            save_to_mongo(result)
108
if __name__ == '__main__':
    # Crawl listing pages 1..100 in parallel and report the elapsed seconds.
    start = time.perf_counter()
    # Worker processes speed up the crawl; the context manager makes sure
    # the pool is shut down once map() has returned.
    with Pool() as pool:
        pool.map(main, range(1, 101))
    print(time.perf_counter() - start)  # elapsed time