import requests
import re
import pandas as pd
# Site root; relative links captured from a listing page are appended to it.
host = r'https://am.22.cn'
# URL prefix of a history detail page; a numeric page id is appended to it
# to form the address of one page.
url = 'https://am.22.cn/wsp/History/Detail/'
# Non-greedy span from the text "tbody" up to the next "</tbody>".
# There is no capture group, so the whole span is the match.
mainpatt = re.compile(r'tbody[\s\S]+?</tbody>')
# One table row; only an attribute-less "<tr>" opener is recognised.
rowpatt = re.compile(r'<tr>[\S\s]+?</tr>')
# Captures the content of a "<td ...>" cell.
# NOTE(review): "[\s\S]+?" needs at least one character between "<td" and
# ">", so a bare "<td>" cannot close at its own ">" and the opener extends
# to a later ">" instead. Callers compensate by stripping leftover markup.
cellpatt = re.compile(r'<td[\s\S]+?>([\s\S]+?)</td>')
# Captures the value of an href="..." attribute.
domainurlpatt = re.compile(r'href="([\s\S]+?)"')
# Captures the text between '_blank">' and '</a>' (presumably the link text
# of an anchor opened with target="_blank" -- confirm against the page HTML).
domainpatt = re.compile(r'_blank">([\s\S]+?)</a>')
# Captures the text inside the page's <title> element.
titlepatt = re.compile(r'<title>([\s\S]+?)</title>')
def getinner(url, timeout=30):
    """Download *url* and return the text of its ``<title>`` element.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server, forwarded to
            ``requests.get``. Without it a stalled connection would block
            the caller indefinitely.

    Returns:
        The text captured between ``<title>`` and ``</title>``, exactly as
        it appears in the page (it is not stripped).

    Raises:
        IndexError: If the page contains no ``<title>`` element.
        requests.RequestException: If the request fails or times out.
    """
    html = requests.get(url, timeout=timeout).content.decode('utf8')
    match = titlepatt.search(html)
    if match is None:
        # Same exception type the bare ``findall(...)[0]`` lookup produced,
        # but with a message that says which page was at fault.
        raise IndexError('no <title> element found in {}'.format(url))
    return match.group(1)
def getsingle(url, timeout=30):
    """Scrape one history page and return its rows as a list of dicts.

    The first ``tbody`` section of the page is split into rows and cells
    with the module-level patterns. For each row, cell 1 supplies the
    domain link, cell 3 the price and cell 4 the status.

    Args:
        url: Absolute URL of the history page to fetch.
        timeout: Seconds to wait for the server, forwarded to
            ``requests.get`` so a stalled connection cannot hang the
            scrape.

    Returns:
        A list with one ``{'domain', 'price', 'status'}`` dict per row, in
        page order. All values are strings.

    Raises:
        IndexError: If the page has no ``tbody`` section, or a row lacks
            the expected cells or link.
        requests.RequestException: If a request fails or times out.
    """
    html = requests.get(url=url, timeout=timeout).content.decode('utf8')
    table = mainpatt.search(html)
    if table is None:
        # Same exception type as the bare ``findall(...)[0]`` lookup, with
        # a message that identifies the offending page.
        raise IndexError('no <tbody> section found in {}'.format(url))

    records = []
    for row in rowpatt.findall(table.group(0)):
        cells = cellpatt.findall(row)
        domainstr = cells[1]
        # The cell pattern can leave stray markup inside the captured
        # text, so it is removed here together with the currency sign.
        price = cells[3].strip().replace('<td>', '').replace('¥', '')
        status = cells[4].strip().replace('</font>', '')
        domainurl = domainurlpatt.findall(domainstr)[0].strip()
        domain = domainpatt.findall(domainstr)[0].strip()
        if '打包' in domain:
            # The link text contains "打包" ("bundle") instead of a domain
            # name, so the linked page's <title> is used as the domain.
            domain = getinner(host + domainurl)
        records.append({'domain': domain, 'price': price, 'status': status})
    return records
def main(start=1000, stop=2232, outfile='1.xlsx'):
    """Scrape a range of history pages and save all rows to an Excel file.

    Args:
        start: First page id to fetch (inclusive).
        stop: Page id at which to stop (exclusive).
        outfile: Path of the Excel workbook to write.

    A page that cannot be fetched or parsed is reported and skipped, so
    one bad page does not discard the rows already collected.
    """
    records = []
    for i in range(start, stop):
        try:
            page_rows = getsingle(url + str(i))
        except (requests.RequestException, IndexError) as exc:
            print(i, 'failed:', exc)
            continue
        # extend() appends in place; ``r = r + a`` recopied the whole
        # accumulated list on every page.
        records.extend(page_rows)
        print(i, 'done')
    pd.DataFrame(records).to_excel(outfile, index=False)


if __name__ == '__main__':
    main()