1 import urllib.request
2 import urllib.parse
3 import requests
4 from urllib.parse import urlencode
5 from pyquery import PyQuery as pq
6 from pymongo import MongoClient
7 import json
8
9
# Highest job id that the script entry point probes (ids run from 1 to max_id).
max_id = 50

# Endpoint queried by get_page(); the job to fetch is named in the POST body.
url = 'http://aibee.com/cn/joinus.aspx?action=jobinfo'

# Headers attached to every request built by get_page().
headers = {
    'Host': 'aibee.com',
    'Referer': 'http://aibee.com/cn/joinus.aspx',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}

# MongoDB handles, created with the client's default connection settings.
# Both the database and the collection are named 'aibee'.
client = MongoClient()
db = client['aibee']
collection = db['aibee']
23
def _strip_markup(text):
    """Trim *text* and remove the tab and list-markup tokens embedded in it."""
    cleaned = text.strip()
    # Tokens are removed one after another in this exact order: a literal tab,
    # the <ul>/<li> tags, and finally the two-character sequence backslash + "t".
    for token in ("\t", "<ul>", "<li>", "</li>", "</ul>", "\\t"):
        cleaned = cleaned.replace(token, "")
    return cleaned


def get_page(id):
    """Fetch one job posting from the endpoint and return its parsed fields.

    The posting is requested by POSTing ``id`` as a form field to the
    module-level ``url`` with the module-level ``headers``.

    Args:
        id: Identifier of the posting to request; sent as the ``id`` form field.

    Returns:
        A dict with the keys ``id``, ``title``, ``zhize``, ``yaoqiu``, ``dtt``
        and ``emailaddr`` (all strings), or the integer ``0`` when the response
        body is exactly 12 characters long.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        IndexError: If the response splits into fewer fields than are read
            below (the highest index used is 16).
    """
    form_data = {'id': id}

    # The POST body must be bytes, so encode the url-encoded form string.
    data = urllib.parse.urlencode(form_data).encode("utf-8")
    request = urllib.request.Request(url, data=data, headers=headers)

    # Read inside a context manager so the connection is always released.
    with urllib.request.urlopen(request) as response:
        result = response.read().decode('utf-8')

    # A 12-character body is treated as "no posting for this id".
    # NOTE(review): presumably that is the endpoint's empty payload -- confirm
    # against the live service.
    if len(result) == 12:
        return 0

    # The body is parsed positionally: commas are turned into colons and the
    # text is split once on ':'; each value is then picked by its index.
    fields = result.replace(",", ":").split(':')

    return {
        'id': fields[2].strip(),
        'title': fields[4].strip(),
        # NOTE(review): 'zhize'/'yaoqiu' look like pinyin for responsibilities
        # and requirements -- confirm against the endpoint's schema.
        'zhize': _strip_markup(fields[6]),
        'yaoqiu': _strip_markup(fields[8]),
        # The last three characters of this field are discarded.
        'dtt': fields[12].strip()[:-3],
        'emailaddr': fields[16].strip(),
    }
72
73
def write_to_file(content):
    """Append *content* to ``aibee.json`` as one JSON document per line.

    Args:
        content: A JSON-serializable object. Non-ASCII characters are written
            as-is (``ensure_ascii=False``) using UTF-8.
    """
    line = json.dumps(content, ensure_ascii=False) + '\n'
    # The with-statement closes the file on exit; no explicit close() needed.
    with open('aibee.json', 'a', encoding='utf-8') as out:
        out.write(line)
78
79
def save_to_mongo(result):
    """Insert *result* into the module-level ``collection`` and report success.

    Uses ``insert_one`` because ``Collection.insert`` was deprecated in
    PyMongo 3.0 and removed in PyMongo 4.0.

    Args:
        result: The document (a dict) to store.
    """
    outcome = collection.insert_one(result)
    # Mirror the original check: report only when an id came back.
    if outcome.inserted_id is not None:
        print('Saved to Mongo')
83
84
if __name__ == "__main__":
    # Probe every id from 1 to max_id; each posting found is printed, appended
    # to the JSON file and stored in MongoDB, in that order.
    for job_id in range(1, max_id + 1):
        content = get_page(job_id)
        if content == 0:
            # get_page() returns 0 when there is nothing to store for this id.
            continue
        print(content)
        write_to_file(content)
        save_to_mongo(content)
94
