# https://www.domp4.com/list/6-1.html
import requests
import re
from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs
import os

def get_url_content(url):  # fetch the page's HTML source
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    else:
        return False

def parse_Web_Content(content):
    Object = BeautifulSoup(content, 'html.parser')

    filmName = get_film_name(Object)
    filmCast = get_film_cast(Object)
    filmIntro = get_film_introduction(Object)
    filmUrl = get_film_url(Object)

    # pair up the parallel lists into one dict per film
    film = []
    for i in range(len(filmName)):
        indiv = {
            'filmName': filmName[i],
            'filmCast': filmCast[i],
            'filmIntro': filmIntro[i],
            'filmUrl': 'https://www.domp4.com' + filmUrl[i]
        }
        film.append(indiv)
    return film

def get_film_name(Soup):
    Name = Soup.select(".play_info")
    name_list = []
    for i in range(len(Name)):
        parsedName = Name[i].a.string
        name_list.append(parsedName)
    return name_list

def get_film_cast(Soup):
    Cast = Soup.find_all('p', attrs={'class': 'space'})
    film_Cast = []
    for i in range(len(Cast)):
        parsedCast = Cast[i].text
        film_Cast.append(parsedCast)
    return film_Cast

def get_film_introduction(Soup):
    Introduction = Soup.find_all('p', attrs={'class': 'content'})
    intro_list = []
    for i in range(len(Introduction)):
        parsedIntro = Introduction[i].text
        intro_list.append(parsedIntro)
    return intro_list

def get_film_url(Soup):
    # the relative detail-page link is the href of the first <a> in each .play_info block
    filmUrl = Soup.select(".play_info")
    Url_list = []
    for i in range(len(filmUrl)):
        href = filmUrl[i].a['href']
        Url_list.append(href)
    return Url_list

def writeTofile(parsedWebcontent):
    # append one tab-separated record per film; the with block closes the file automatically
    with open('film.txt', 'a', encoding='utf-8') as f:
        for i in range(len(parsedWebcontent)):
            f.write(parsedWebcontent[i]['filmName'] + '\t')
            f.write(parsedWebcontent[i]['filmCast'] + '\t')
            f.write(parsedWebcontent[i]['filmIntro'] + '\t')
            f.write(parsedWebcontent[i]['filmUrl'] + '\t')
            f.write('\n')

# crawl listing pages 1 through 3 and append the parsed films to film.txt
link = "https://www.domp4.com/list/6-"
for i in range(1, 4):
    url = link + str(i) + ".html"
    webContent = get_url_content(url)

    if webContent is not False:
        Content = parse_Web_Content(webContent)
        writeTofile(Content)