import requests,re,json,time
from requests.exceptions import RequestException
# HTTP headers passed to requests.get() by get_one_page. The User-Agent
# identifies the client as an old MSIE browser -- presumably so the site
# serves its regular HTML to this script (TODO confirm it is still needed).
headers = {"User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"}
def get_one_page(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded response text when the server answers with HTTP 200;
        ``None`` for any other status code or when the request itself fails
        (connection error, timeout, ...).
    """
    try:
        # Without a timeout requests may block forever on a stalled server.
        r = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        # Network-level failures are reported the same way as a bad status,
        # so callers only have to deal with a single "no page" signal.
        return None
    if r.status_code == 200:
        return r.text
    return None
# One <dd>...</dd> board entry. Capture groups, in order: rank, poster URL,
# title, star text, release-time text, integer and fraction parts of the score.
# Raw strings keep "\d" a regex escape instead of an invalid string escape.
_BOARD_ITEM_RE = re.compile(
    r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
    r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
    r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
    re.S,
)


def parse_one_page(html):
    """Yield one dict per ``<dd>`` movie entry found in *html*.

    Args:
        html: Page markup as a string. ``None`` or an empty string yields
            nothing instead of raising.

    Yields:
        Dicts with the string-valued keys ``index``, ``image``, ``title``,
        ``actor``, ``time`` and ``score``.
    """
    if not html:
        # A failed download hands us None; there is nothing to parse.
        return
    for match in _BOARD_ITEM_RE.finditer(html):
        index, image, title, star, release, integer, fraction = match.groups()
        yield {
            'index': index,
            'image': image,
            'title': title.strip(),
            # Drop the first 3 / 5 characters of the stripped text --
            # presumably a fixed label prefix in the markup (TODO confirm).
            # Slicing past the end already gives '', so no length check.
            'actor': star.strip()[3:],
            'time': release.strip()[5:],
            'score': integer.strip() + fraction.strip(),
        }
def write_to_file(content, path='result.txt'):
    """Append *content* to *path* as a single JSON line.

    Args:
        content: JSON-serialisable object to store.
        path: File to append to; defaults to ``result.txt`` in the current
            working directory.
    """
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file
    # encoding must be pinned; the platform default may not be able to
    # encode them.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
def main(offset):
    """Fetch one board page, then print and save every movie parsed from it.

    Args:
        offset: Value placed in the ``offset`` query parameter of the URL.
    """
    url = 'https://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:
        # get_one_page signals "no page" with None; the parser expects a
        # string, so skip this offset instead of crashing the whole run.
        print('Skipping offset ' + str(offset) + ': page could not be fetched')
        return
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)
if __name__ == '__main__':
    # Process offsets 0, 10, ..., 90, pausing one second after each page.
    for page_offset in range(0, 100, 10):
        main(offset=page_offset)
        time.sleep(1)