import ssl, re,json
from urllib.request import urlopen
# Skip HTTPS certificate verification: replace the ssl module's default
# HTTPS context factory with the one that builds an unverified context.
# NOTE(review): this disables certificate checks for HTTPS requests that rely
# on the default context, which removes protection against man-in-the-middle
# attacks -- confirm it is really required for the target site.
ssl._create_default_https_context = ssl._create_unverified_context
def getPage(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Args:
        url: Address to open; anything ``urllib.request.urlopen`` accepts.

    Returns:
        The complete response body as a ``str``.

    Raises:
        urllib.error.URLError: If the resource cannot be opened.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the response (and its underlying connection)
    # even when read() or decode() raises.
    with urlopen(url) as response:
        return response.read().decode("utf-8")

# Address of the first result page; later pages append "pgN" to it.
BASE_URL = "https://bj.lianjia.com/ershoufang/"
# File that receives one JSON document per line (opened in append mode).
OUTPUT_PATH = "lianjia_Second-hand house_info"
# Number of result pages the script walks through.
PAGE_COUNT = 100

# Pattern for a single listing card, written as adjacent raw-string fragments
# with one named group per extracted field.
# NOTE(review): the group names look like pinyin/English field labels
# (e.g. xiaoqu, huxing, mianji) -- confirm their meaning against the site.
LISTING_PATTERN = re.compile(
    r'<!-- 热推标签、埋点 -->.*?data-is_focus="(?:1)?" data-sl="">(?P<title>.*?)</a>'
    r'.*?data-el="region">(?P<xiaoqu>.*?)</a>'
    r'.*?</span>(?P<huxing>.*?)<span'
    r'.*?/</span>(?P<mianji>.*?)<span'
    r'.*?/</span>(?P<chaoxiang>.*?)<span'
    r'.*?/</span>(?P<zhuangxiu>.*?)<'
    # "(...)?" lets the bracketed part occur zero or one time. "(?:...)" makes
    # the bracket non-capturing: findall() would otherwise return the group
    # contents in preference to the overall match.
    r'(?:span class="divide">/</span>(?P<dianti>.*?)<)?'
    r'.*?div class="positionInfo">(?P<flood>.*?)<span'
    r'.*?/</span>(?P<floodtime>.*?)<span'
    r'.*?target="_blank">(?P<diqu>.*?)</a>'
    r'.*?class="followInfo">(?P<followInfo>.*?)<span'
    r'.*?/</span>(?P<daikancishu>.*?)<div class="tag">'
    r'(?:<span class="subway">(?P<subway>.*?)</span>)?'  # optional
    r'(?:<span class=".*?">(?P<fangben>.*?)</span>)?'  # optional
    r'(?:<span class="haskey">(?P<haskey>.*?)</span>)?'  # optional
    r'.*?<div class="totalPrice"><span>(?P<totalPrice>.*?)</div>'
    r'.*?data-price=".*?"><span>(?P<unitPrice>.*?)</span>',
    re.S,  # let "." match line breaks as well
)


def parsePage(d):
    """Yield one dict per listing card found in the page source *d*.

    Args:
        d: HTML text of a result page.

    Yields:
        A dict keyed by the named groups of ``LISTING_PATTERN`` in pattern
        order. Groups inside an optional fragment (``dianti``, ``subway``,
        ``fangben``, ``haskey``) are ``None`` when that fragment is absent.
        ``totalPrice`` has the literal ``</span>`` removed.
    """
    for match in LISTING_PATTERN.finditer(d):
        record = match.groupdict()
        # The totalPrice group captures everything up to "</div>", which
        # includes the closing tag of the inner span; drop that tag.
        record["totalPrice"] = record["totalPrice"].replace("</span>", "")
        yield record


def main():
    """Fetch every result page and append each listing to OUTPUT_PATH as JSON."""
    # "with" closes the output file even if a download or parse fails midway.
    with open(OUTPUT_PATH, mode="a", encoding="utf-8") as f:
        for page in range(1, PAGE_COUNT + 1):
            # The first page is the bare base URL; page N (N >= 2) is ".../pgN".
            if page == 1:
                url = BASE_URL
            else:
                url = BASE_URL + "pg%s" % page
            print(url)
            for obj in parsePage(getPage(url)):
                data = json.dumps(obj, ensure_ascii=False)
                print(data)
                f.write(data + "\n")
                # Flushing after every record is kept from the original;
                # presumably so finished records reach disk if the run stops.
                f.flush()


if __name__ == "__main__":
    main()