1 import requests
2 from time import sleep
3 from lxml import etree
4 from fake_useragent import UserAgent
5 from random import randint
6 import re
7 from threading import Thread
8 from queue import Queue
9 from os import remove
10 from copy import copy, deepcopy
11 from bs4 import BeautifulSoup, Comment
12 from pyquery import PyQuery
13
# One HTTP session shared by get_html() and save_image().
session = requests.Session()

# Request headers sent with every scraper request. The random User-Agent
# string is drawn once, when this module is loaded, and then reused.
headers = {'User-Agent': UserAgent().random}
def get_html(url, timeout=10):
    """Fetch *url* with the shared session and return the page text.

    The call first pauses for a random 1-3 seconds (``randint`` bounds are
    inclusive) -- presumably to throttle requests against the target site.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` waits indefinitely on a stalled server.

    Returns:
        The response body as text when the status code is 200,
        otherwise ``None``.

    Raises:
        requests.RequestException: Propagated unchanged on connection
            errors or when the timeout expires.
    """
    sleep(randint(1, 3))
    response = session.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        return None
    return response.text
25
def save_image(filename, url, data, timeout=10):
    """Download an image and store its bytes in ``data['file_content']``.

    Nothing is written to disk; the image name is only printed.

    Args:
        filename: Fallback name to print when no ``name.ext`` segment can
            be extracted from *url*.
        url: Image address. When it is an ``https://<host>.com`` URL the
            host is replaced with ``img1.abcdbio.com`` before downloading.
        data: Mutable mapping; its ``'file_content'`` entry is set to the
            downloaded bytes when the server answers 200 and is left
            untouched otherwise.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: Propagated unchanged on connection
            errors or when the timeout expires.
    """
    # Prefer the "name.ext" path segment that sits right before a query
    # string; fall back to a trailing "name.ext" segment.
    names = re.findall(r'/([^/]+\.[^/]+)\?', url)
    if not names:
        names = re.findall(r'/([^/]+\.[^/]+)$', url)
        # No "name.ext?" form was found, so the imageView2 query
        # (w/160, h/220 -- presumably a resize request) is appended.
        url = url + '?imageView2/1/w/160/h/220'
    name = names[0] if names else filename

    # Rewrite only the host part. The dot is escaped so it matches a
    # literal ".com", a non-matching URL is left as-is instead of raising
    # IndexError, and count=1 keeps the path/query from being rewritten.
    host = re.match(r'https://([^/]+\.com)', url)
    if host:
        url = url.replace(host.group(1), 'img1.abcdbio.com', 1)

    # The context manager releases the connection even if reading fails.
    with session.get(url, stream=True, headers=headers,
                     timeout=timeout) as response:
        if response.status_code == 200:
            # bytes are immutable, so no defensive copy is needed.
            data['file_content'] = response.content

    print(name)
49
def parse_film_bs4(html):
    """Parse the film list page with BeautifulSoup.

    Args:
        html: Markup of the list page.

    Returns:
        A list with one dict per ``<li>`` inside the first
        ``<ol class="grid_view">``. Each dict has the keys
        ``detail_url``, ``film_name``, ``film_theme``, ``film_info`` and
        ``file_content`` (always ``None`` here; filled in later).

    Raises:
        IndexError: If the list, a link/image, the ``div.bd`` block or its
            second ``<p>`` is missing.
        KeyError: If a link has no ``href`` or an image has no
            ``alt``/``src``.
        AttributeError: If the second ``<p>`` contains no ``<span>``.
    """
    soup = BeautifulSoup(html, 'lxml')
    # select() takes a CSS selector; a ``class_=`` keyword is a find_all()
    # feature and does not filter here, so the class goes in the selector.
    film_items = soup.select('ol.grid_view')[0].select('li')

    result_list = []
    for film in film_items:
        link = film.select('div > a')[0]
        poster = film.select('div > a > img')[0]
        info_paragraph = film.select('div.bd')[0].select('p')[1]
        result_list.append({
            'detail_url': link.attrs['href'],
            'film_name': poster.attrs['alt'],
            'film_theme': poster.attrs['src'],
            'film_info': info_paragraph.span.string,
            'file_content': None,
        })
    return result_list
67
def parse_film_pyquery(html):
    """Parse the film list page with PyQuery.

    Args:
        html: Markup of the list page.

    Returns:
        A list with one dict per child of the ``.grid_view`` match found
        under ``ol``. Each dict has the keys ``detail_url``,
        ``film_name``, ``film_theme``, ``film_info`` and ``file_content``
        (always ``None`` here; filled in later).
    """
    document = PyQuery(html)
    entries = document('ol')('.grid_view').children()

    records = []
    # items() yields each child wrapped as its own PyQuery object.
    for entry in entries.items():
        # First link inside the first <div>, and the first image in it.
        anchor = entry('div').eq(0)('a').eq(0)
        poster = anchor('img').eq(0)
        # Text of the second <p> found under div.bd.
        info_text = entry.find('div.bd p').eq(1).text()
        records.append({
            'detail_url': anchor.attr('href'),
            'film_name': poster.attr('alt'),
            'film_theme': poster.attr('src'),
            'film_info': info_text,
            'file_content': None,
        })
    return records
86
def parse_film_xpath(html):
    """Parse the film list page with lxml XPath queries.

    Args:
        html: Markup of the list page.

    Returns:
        A list with one dict per ``<li>`` directly under
        ``<ol class="grid_view">``. Each dict has the keys
        ``detail_url``, ``film_name``, ``film_theme``, ``film_info`` and
        ``file_content`` (always ``None`` here; filled in later).

    Raises:
        IndexError: If an item lacks the link, image or info span.
        KeyError: If the link has no ``href`` or the image has no
            ``alt``/``src``.
    """
    tree = etree.HTML(html)

    def to_record(item):
        # Lookups run in the same order as the fields are listed.
        detail_url = item.xpath('div//a')[0].attrib['href']
        poster = item.xpath('div//a/img')[0]
        film_name = poster.attrib['alt']
        film_theme = poster.attrib['src']
        info_span = item.xpath("div//div[@class='bd']/p[2]/span")[0]
        # string(.) concatenates all text inside the span.
        film_info = info_span.xpath('string(.)')
        return {
            'detail_url': detail_url,
            'film_name': film_name,
            'film_theme': film_theme,
            'film_info': film_info,
            'file_content': None,
        }

    return [to_record(item)
            for item in tree.xpath("//ol[@class='grid_view']/li")]
104
def save_film(data, url='http://localhost:8069/abcdb/film', timeout=10):
    """POST one film record to the collecting server.

    Args:
        data: Film dict providing ``film_name``, ``detail_url``,
            ``film_info`` and ``file_content`` (the image payload, sent
            as the ``theme`` file part).
        url: Endpoint that receives the record. Defaults to the local
            server the script was written against.
        timeout: Seconds to wait for the server before giving up, so a
            stalled server cannot block the caller forever.

    Raises:
        KeyError: If *data* lacks one of the keys listed above.
        requests.RequestException: Propagated unchanged on connection
            errors or when the timeout expires.
    """
    form_fields = {
        'name': data['film_name'],
        'detail_url': data['detail_url'],
        'info': data['film_info'],
    }
    files = {'theme': data['file_content']}

    response = requests.post(url, data=form_fields, files=files,
                             timeout=timeout)
    # The response body is not used; release the connection right away.
    response.close()
117
class PostToServer(Thread):
    """Worker thread that uploads queued film records with save_film().

    The thread blocks on the queue until an item arrives and stops when it
    receives ``PostToServer.STOP``, so ``join()`` returns once the
    producer has signalled that no more work is coming.
    """

    # Sentinel a producer puts on the queue to make the worker exit.
    STOP = object()

    def __init__(self, data_queue):
        """Remember the queue this worker consumes from."""
        Thread.__init__(self)
        self.data_queue = data_queue

    def run(self):
        """Upload items until the STOP sentinel is received."""
        while True:
            # A blocking get() replaces the empty()/sleep() polling loop.
            item = self.data_queue.get()
            if item is self.STOP:
                break
            print(item)
            save_film(item)


def main():
    """Scrape the first list page and hand the first five films to the
    uploader thread."""
    data_queue = Queue()
    worker = PostToServer(data_queue)
    worker.start()

    try:
        url = 'https://movie.abcdb.com/top250?start=0&filter='
        html = get_html(url)
        if html is None:
            # get_html() returns None for any non-200 answer.
            print('failed to fetch ' + url)
            return

        # parse_film_xpath() and parse_film_bs4() take the same input and
        # can be swapped in here.
        films = parse_film_pyquery(html)
        for film in films[:5]:
            save_image('theme.jpg', film['film_theme'], film)
            data_queue.put(film)
    finally:
        # Always tell the worker to finish; otherwise the non-daemon
        # thread would keep the process alive forever.
        data_queue.put(PostToServer.STOP)
        worker.join()
146
# Run the scraper only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()