python3----练习题(爬取电影天堂资源,大学排名,淘宝商品比价,股票数据)

 1 import requests
 2 import re
 3 
 4 url = 'http://www.ygdy8.net/html/gndy/dyzz/list_23_{}.html'
 5 for n in range(1, 2):
 6     new_url = url.format(n)
 7     html_1 = requests.get(new_url)
 8     html_1.encoding = 'gb2312'
 9     detil_list = re.findall('<a href="(.*?)" class="ulink">', html_1.text)
10 
11     for m in detil_list:
12         b_url = 'http://www.ygdy8.net' + m
13         html_2 = requests.get(b_url)
14         html_2.encoding = 'gb2312'
15         ftp = re.findall('<a href="(.*?)">.*?</a></td>', html_2.text)
16         with open('tddy.txt', 'a', encoding='utf-8') as f:
17             f.write(ftp[0] + '\n')

 

大学排名练习

 1 import bs4
 2 import requests
 3 from bs4 import BeautifulSoup
 4 
 5 def get_html_text(url):
 6     try:
 7         r = requests.get(url, timeout=20)
 8         r.raise_for_status()
 9         r.encoding = r.apparent_encoding
10         return r.text
11     except:
12         return " "
13 
14 
def fill_univ_list(ulist, html):
    """Append one ``[cell0, cell1, cell3]`` entry to *ulist* per table row.

    The rows are the tag children of the first ``<tbody>`` in *html*; the
    stored values are the ``.string`` of the 1st, 2nd and 4th ``<td>`` of
    each row (presumably rank, school name and total score, which is how
    ``print_univ_list`` labels them).

    Args:
        ulist: list that receives the extracted rows (mutated in place).
        html: HTML source of the ranking page.
    """
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find('tbody')
    if tbody is None:
        # Happens e.g. when the download failed and html is the " " placeholder.
        return
    for tr in tbody.children:
        # .children also yields whitespace text nodes; only real tags are rows.
        if not isinstance(tr, bs4.element.Tag):
            continue
        tds = tr('td')
        if len(tds) < 4:
            # Row without enough cells; indexing tds[3] would raise IndexError.
            continue
        ulist.append([tds[0].string, tds[1].string, tds[3].string])
21 
22 
def print_univ_list(ulist, num):
    """Print a header line followed by at most *num* rows of *ulist*.

    Args:
        ulist: sequence of rows; the first three items of each row are
            printed as centred, tab-separated columns.
        num: maximum number of rows to print. Fewer rows are printed when
            *ulist* is shorter, instead of raising IndexError.
    """
    tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
    # chr(12288) is the full-width (ideographic) space; it is used as the
    # fill character for the middle column.
    fill = chr(12288)
    print(tplt.format("排名", "学校名称", "总分", fill))
    for u in ulist[:max(num, 0)]:
        # A cell may be None; None does not support the "^10" format spec,
        # so render it as an empty string.
        rank, name, score = ("" if v is None else str(v) for v in u[:3])
        print(tplt.format(rank, name, score, fill))
29 
30 
def main():
    """Fetch the hard-coded ranking page, parse it and print the first 20 rows."""
    rankings = []
    page = get_html_text('http://www.zuihaodaxue.com/zuihaodaxuepaiming2016.html')
    fill_univ_list(rankings, page)
    print_univ_list(rankings, 20)


# Run the crawl as soon as the script is executed.
main()

 淘宝商品比价:

 1 import requests
 2 import re
 3 
 4 def get_html_text(url):
 5     try:
 6         r = requests.get(url, timeout=30)
 7         r.raise_for_status()
 8         r.encoding = 'utf-8'
 9         return r.text
10     except:
11         return ""
12 
13 
def parse_page(ilt, html):
    """Extract ``[price, title]`` pairs from *html* and append them to *ilt*.

    Prices come from ``"view_price":"..."`` fields and titles from
    ``"raw_title":"..."`` fields; the n-th price is paired with the n-th
    title. When the counts differ, the surplus entries are ignored.

    Args:
        ilt: list that receives the ``[price, title]`` pairs (mutated in place).
        html: page source to scan.
    """
    # Local import so this block does not depend on the file's import header.
    import json

    # Capture groups pull the value out directly. The original used
    # split(':') + eval(), which executed downloaded text and broke on any
    # title containing a colon.
    prices = re.findall(r'"view_price":"([\d.]*)"', html)
    raw_titles = re.findall(r'"raw_title":"((?:[^"\\\n]|\\.)*)"', html)

    for price, raw_title in zip(prices, raw_titles):
        try:
            # Presumably the fields are JSON string literals; decode escapes
            # such as \uXXXX without evaluating anything as code.
            title = json.loads('"' + raw_title + '"')
        except ValueError:
            # Not valid JSON string content; keep the raw text.
            title = raw_title
        ilt.append([price, title])
24 
def print_goods_list(ilt):
    """Print a numbered, tab-separated table of the goods in *ilt*.

    Each entry of *ilt* is indexed as ``[price, title]``; rows are numbered
    starting at 1 and preceded by a header line.
    """
    row_format = "{:4}\t{:8}\t{:16}"
    print(row_format.format("序号", "价格", "商品名称"))
    for index, item in enumerate(ilt, start=1):
        print(row_format.format(index, item[0], item[1]))
32 
def main():
    """Search Taobao for a fixed keyword and print the collected price list.

    Fetches ``depth`` result pages, feeds each one to ``parse_page`` and
    finally prints everything that was collected.
    """
    goods = '减肥餐'
    depth = 2
    start_url = 'http://s.taobao.com/search?q=' + goods
    info_list = []
    for i in range(depth):
        try:
            # The s= offset advances by 44 per page (presumably the number of
            # items on one result page -- confirm against the site).
            url = start_url + '&s=' + str(44*i)
            html = get_html_text(url)
            parse_page(info_list, html)
        except Exception:
            # Best effort: a page that fails is skipped. Unlike a bare except,
            # this still lets Ctrl-C / SystemExit stop the crawl.
            continue
    print_goods_list(info_list)

 股票数据:

 1 import re
 2 import traceback
 3 
 4 import requests
 5 import sys
 6 from bs4 import BeautifulSoup
 7 
 8 
 9 def get_html_text(url, code='utf-8'):
10     headers ={'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
11     try:
12         r = requests.get(url, timeout=20, headers=headers)
13         r.raise_for_status()
14         r.encoding = code
15         return r.text
16     except:
17         return ""
18 
def get_stock_list(lst, stock_url):
    """Collect stock codes found in the anchors of the page at *stock_url*.

    For every ``<a>`` tag whose markup contains at least one code of the
    form ``sh``/``sz`` + six digits, the list of all codes matched in that
    tag is appended to *lst* as a single entry.
    """
    page = get_html_text(stock_url, 'GB2312')
    anchors = BeautifulSoup(page, 'html.parser').find_all('a')
    for anchor in anchors:
        matches = re.findall(r'[s][hz]\d{6}', str(anchor))
        if matches:
            lst.append(matches)
27 
28 
def _print_progress(done, total):
    """Rewrite the current console line with the completion percentage."""
    # end="" belongs to print(), not str.format(): without it every update
    # ends in a newline and the leading "\r" never overwrites anything.
    print("\r当前进度: {:.2f}%".format(done * 100 / total), end="", flush=True)


def get_stock_info(lst, stock_url, fpath):
    """Download the detail page of every stock in *lst* and log it to *fpath*.

    For each entry the page ``stock_url + entry[0] + '.html'`` is fetched,
    the ``stock-bets`` div is located, and a dict holding the stock name
    plus every ``<dt>``/``<dd>`` pair is appended to *fpath* as one line.
    Pages that cannot be downloaded or lack the div are skipped; any other
    failure prints a traceback and processing moves on to the next stock.

    Args:
        lst: entries whose first item is the stock code.
        stock_url: base URL the code is appended to.
        fpath: path of the output text file (opened in append mode, UTF-8).
    """
    total = len(lst)
    for count, stock in enumerate(lst, start=1):
        url = stock_url + stock[0] + '.html'
        # NOTE(review): this URL echo shares the terminal line with the
        # in-place progress indicator; drop it if a clean progress line is wanted.
        print(url)
        html = get_html_text(url)
        try:
            if html == "":
                continue
            soup = BeautifulSoup(html, 'html.parser')
            stock_info = soup.find('div', attrs={'class': 'stock-bets'})
            if stock_info is None:
                # Page without the expected info block; nothing to record.
                continue

            info_dict = {'股票名称': stock_info.text.split()[0]}
            # Pair each <dt> label with the <dd> value at the same position.
            for key_tag, value_tag in zip(stock_info.find_all('dt'),
                                          stock_info.find_all('dd')):
                info_dict[key_tag.text] = value_tag.text

            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(info_dict) + '\n')
        except Exception:
            # Best effort: report the problem and continue with the next stock.
            traceback.print_exc(file=sys.stdout)
        finally:
            # Count every processed stock (also skipped ones) so the
            # percentage can actually reach 100.
            _print_progress(count, total)
58 
def main():
    """Collect the stock codes, then write each stock's details to the output file."""
    list_page = 'http://quote.eastmoney.com/stocklist.html'
    detail_base = 'http://gupiao.baidu.com/stock/'
    report_path = 'D:/BaiduStockInfo.txt'

    codes = []
    get_stock_list(codes, list_page)
    get_stock_info(codes, detail_base, report_path)

 

posted @ 2018-01-25 21:28  jonm  阅读(582)  评论(0)  编辑  收藏  举报