Python爬取电影网站信息并写入文件

 1 #https://www.domp4.com/list/6-1.html
 2 import requests
 3 import re
 4 from bs4 import BeautifulSoup
 5 from urllib.parse import urlparse,parse_qs
 6 import os
 7 
 8 
 9 def get_url_content(url): //获取网站的源码
10     response=requests.get(url)
11     if response.status_code==200:
12         return response.text
13     else:
14         return False
15 
def parse_Web_Content(content):
    """Turn the HTML of one list page into a list of film records.

    Each record is a dict with the keys ``fileName``, ``filmCast``,
    ``filmIntro`` and ``filmurl``; the link is made absolute by prefixing
    the site root.  The four field lists are matched up by position.
    """
    soup = BeautifulSoup(content, 'html.parser')

    names = get_film_name(soup)
    casts = get_film_cast(soup)
    intros = get_film_introduction(soup)
    links = get_film_url(soup)

    # The number of titles decides how many records are produced; the other
    # lists are indexed by the same position.
    return [
        {
            'fileName': title,
            'filmCast': casts[idx],
            'filmIntro': intros[idx],
            'filmurl': 'https://www.domp4.com' + links[idx],
        }
        for idx, title in enumerate(names)
    ]
34 
35 
def get_film_name(Soup):
    """Collect the film titles found in a parsed page.

    For every element matching the ``.play_info`` CSS selector, the value of
    ``.a.string`` is taken as the title.  Returns the titles as a list, in
    document order as delivered by ``Soup.select``.
    """
    return [entry.a.string for entry in Soup.select(".play_info")]
43 
def get_film_cast(Soup):
    """Return the text of every ``<p class="space">`` element in the page.

    The result is a list with one string per matching paragraph.
    """
    paragraphs = Soup.find_all('p', attrs={'class': 'space'})
    return [paragraph.text for paragraph in paragraphs]
51 
def get_film_introduction(Soup):
    """Return the text of every ``<p class="content">`` element in the page.

    The result is a list with one string per matching paragraph.
    """
    paragraphs = Soup.find_all('p', attrs={'class': 'content'})
    return [paragraph.text for paragraph in paragraphs]
59 
def get_film_url(Soup):
    """Collect the link target of each film entry in a parsed page.

    For every element matching the ``.play_info`` CSS selector, the ``href``
    value read from ``.a`` is returned unchanged; the caller decides how to
    turn it into a full address.
    """
    return [entry.a['href'] for entry in Soup.select(".play_info")]
68 
def writeTofile(parsedWebcontent):
    """Append the film records to ``film.txt`` as tab-separated lines.

    Each record contributes one line: name, cast, introduction and link,
    every field followed by a tab, then a newline.  The file is opened in
    append mode with UTF-8 encoding, so earlier contents are kept.
    """
    columns = ('fileName', 'filmCast', 'filmIntro', 'filmurl')
    with open('film.txt', 'a', encoding='utf-8') as out:
        for record in parsedWebcontent:
            for key in columns:
                out.write(record[key] + '\t')
            out.write('\n')
    # No explicit close is needed: the with-statement closes the file on exit.
78 
79 
# Scrape list pages 1 through 3 and append every film found to film.txt.
link = "https://www.domp4.com/list/6-"
for i in range(1, 4):
    url = f"{link}{i}.html"
    webContent = get_url_content(url)

    # get_url_content returns False when the page could not be fetched;
    # skip that page and move on to the next one.
    if webContent == False:
        continue
    Content = parse_Web_Content(webContent)
    writeTofile(Content)

 

posted @ 2019-04-24 22:07  kevin162726  阅读(482)  评论(0)  编辑  收藏  举报