import re
import json
from urllib.request import urlopen
import ssl
# Disable HTTPS certificate verification by swapping the ssl module's default
# HTTPS context factory for the unverified one (original comment, translated:
# "get rid of the digital-signature certificate").
# NOTE(review): this uses private ssl attributes and applies to every HTTPS
# request in the process that relies on the default context - confirm that
# skipping verification is acceptable here.
ssl._create_default_https_context = ssl._create_unverified_context
# URL fetched by main() for the first page of listings.
ershoufang_url='https://bj.lianjia.com/ershoufang/rs/'
def get_html_content(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    The response is opened in a ``with`` block so the underlying connection
    is closed as soon as the body has been read, instead of being left for
    the garbage collector (the caller requests many pages in a row).

    Args:
        url: Address to request; anything ``urllib.request.urlopen`` accepts.

    Returns:
        The full response body as a ``str``.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    with urlopen(url) as response:
        return response.read().decode('utf-8')
def chuli(content):
    """Lazily extract one record per listing found in a page's HTML.

    The HTML is scanned with a single regular expression (``re.S`` so that
    ``.`` also spans newlines). For every match a dict is yielded with these
    keys, in this order:

    * ``'价格:'``     - captured price with ``'万'`` appended
    * ``'房屋信息:'`` - captured link text (listing title)
    * ``'平米数:'``   - third ``<span>/</span>``-separated field of the info div
    * ``'朝向'``      - fourth field of the info div
    * ``'装修:'``     - the rest of the info div, with any remaining
      ``<span>/</span>`` separators turned into commas
    * ``'房本信息:'`` - text of the captured tag span; ``'随时看房'`` and
      ``'关注'`` are replaced by ``'无信息'``

    Args:
        content: HTML text of a listing page.

    Yields:
        dict: One mapping of the fields above per regex match.
    """
    listing_pattern = re.compile(
        r'<span.*?>关注</span></div><div.*?><span></span></div><div.*?><span></span></div><div class="price"><span>(?P<price>.*?)</span>万</div></a><a.*?>(?P<title>.*?)</a><div class="info">.*?<span>/</span>.*?<span>/</span>(?P<pingmi>.*?)<span>/</span>(?P<fangxiang>.*?)<span>/</span>(?P<zhuangxiu>.*?)</div><div .*?>(?:<span .*?>.*?</span>)?<span.*?>(?P<fangben>.*?)</span>',
        re.S,
    )
    for match in listing_pattern.finditer(content):
        fields = match.groupdict()
        # The last info field may still contain separators; flatten them.
        decoration = fields['zhuangxiu'].replace('<span>/</span>', ',')
        # Presumably these two texts come from unrelated spans captured when
        # the expected tag is absent, so they are mapped to a placeholder.
        deed = fields['fangben'].replace('随时看房', '无信息')
        deed = deed.replace('关注', '无信息')
        yield {
            '价格:': fields['price'] + '万',
            '房屋信息:': fields['title'],
            '平米数:': fields['pingmi'],
            '朝向': fields['fangxiang'],
            '装修:': decoration,
            '房本信息:': deed,
        }
def xieru(jieguo):
    """Append *jieguo* as one JSON line to the file ``houseInfo``.

    The object is serialised first (non-ASCII characters are kept as-is
    rather than escaped), then the file in the current working directory is
    opened in append mode with UTF-8 encoding and the line is written.

    Args:
        jieguo: JSON-serialisable object to record.
    """
    serialized = json.dumps(jieguo, ensure_ascii=False)
    with open('houseInfo', mode='a', encoding='utf-8') as out_file:
        # print() supplies the trailing newline that ends the record.
        print(serialized, file=out_file)
def main():
    """Scrape listing pages 1 through 100, saving and printing every record.

    Page 1 is fetched from ``ershoufang_url``; each later page uses the
    ``pg<N>/`` URL. Every record parsed from a page is appended to the output
    file via ``xieru`` and then echoed to stdout.
    """
    for page_number in range(1, 101):
        page_url = (
            ershoufang_url
            if page_number == 1
            else 'https://bj.lianjia.com/ershoufang/pg%d/' % page_number
        )
        page_html = get_html_content(page_url)
        for record in chuli(page_html):
            xieru(record)
            print(record)
if __name__=='__main__':
main()