保存花椒直播数据

 1 """
 2 个人主页:http://www.huajiao.com/user/52231287
 3 #set()数据类型应该用add添加
 4 find_all的用法
 5 正则的用法
 6 response.text 和content的区别
 7 """
 8 from bs4 import BeautifulSoup
 9 import requests
10 from requests.exceptions import RequestException
11 import re
12 import pymysql
13 from config import *
14 
# Open the MySQL connection used by the rest of this script.
# The settings involved are HOST, PORT, USER and PASSWORD.
conn = pymysql.connect(
    host='127.0.0.1',
    port=3307,
    user='root',
    password='root',
    db='songqin',
    charset='utf8mb4',
    # cursorclass=pymysql.cursors.DictCursor
)
# Obtain a cursor for executing statements.
cursor = conn.cursor()
27 
def get_one_page(url):
    """Collect the live-room IDs linked from a listing page.

    Fetches ``url``, parses the HTML and gathers every ``<a>`` tag whose
    ``href`` starts with ``/l/``; whatever follows that prefix is kept as
    the ID.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        A set of ID strings (a set, so repeated links collapse into one
        entry), or ``None`` when the request raises ``RequestException``.
    """
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, timeout=10)
    except RequestException:
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    prefix = '/l/'
    # ``link`` is a tag object, so the URL is read from its attributes.
    # An alternative is re.findall(r'(\d+)', href)[0]; findall returns a
    # list, hence the [0].
    return {
        link.attrs['href'][len(prefix):]
        for link in soup.find_all('a', href=re.compile(r'^/l/'))
    }
def get_userId(liveId):
    """Look up the user ID shown in a live room's page title.

    Downloads ``http://www.huajiao.com/l/<liveId>`` and reads the ID from
    the page ``<title>``: the digits that follow the first ``label:`` found
    in the title text.

    Args:
        liveId: Live-room ID to substitute into the URL.

    Returns:
        The user ID as a string of digits.

    Raises:
        IndexError: If the title contains no ``label:digits`` pair.
        requests.exceptions.RequestException: If the request fails.
    """
    # Without a timeout, requests waits indefinitely on a stalled server.
    response = requests.get(
        'http://www.huajiao.com/l/{}'.format(liveId), timeout=10
    )
    soup = BeautifulSoup(response.text, 'html.parser')
    title = soup.title.get_text()
    # Capture the digits themselves rather than slicing a fixed number of
    # characters off the match, so the result does not depend on how long
    # the label before the colon happens to be.
    user_ids = re.findall(r'\w+:(\d+)', title)
    return user_ids[0]
55 
def get_userData(userId):
    """Scrape one user's profile page and hand the result to save_to_mysql.

    Downloads ``http://www.huajiao.com/user/<userId>`` and reads, from the
    ``div#userInfo`` element, the avatar URL, the user name, the address,
    and the first and fifth text fragments of the ``ul.clearfix`` list
    (stored as ``fans`` and ``gift``).

    Args:
        userId: User ID to substitute into the profile URL.

    Returns:
        None. The collected dict is printed and passed to ``save_to_mysql``.
        If the page has no ``div#userInfo`` a message is printed and nothing
        is stored.
    """
    # Without a timeout, requests waits indefinitely on a stalled server.
    response = requests.get(
        'http://www.huajiao.com/user/{}'.format(userId), timeout=10
    )
    soup = BeautifulSoup(response.text, 'html.parser')
    userInfo = soup.find('div', {'id': 'userInfo'})
    if userInfo is None:
        # No profile block on the page; there is nothing to parse or store.
        print("未找到用户信息", userId)
        return

    # Matched on the single class 'avatar'; the original author noted that
    # passing the full class attribute value did not work.
    avatar = userInfo.find('div', {'class': 'avatar'}).img.attrs['src']

    # Header text looks like: name|花椒号:65245592|四川 成都
    header = userInfo.h3.get_text('|', strip=True).split('|')
    stats = userInfo.find('ul', {'class': 'clearfix'}).get_text(
        '|', strip=True
    ).split('|')

    data = {
        'image': avatar,
        'username': header[0],
        # The address segment is absent from some headers; store an empty
        # string instead of failing with IndexError.
        'address': header[2] if len(header) > 2 else '',
        'fans': stats[0],
        'gift': stats[4],
    }
    print("正在插入", data)
    save_to_mysql(data)
    print('\n')
73 
def save_to_mysql(result):
    """Insert one scraped profile into the ``huajiao1`` table.

    The values are passed to ``cursor.execute`` as parameters rather than
    formatted into the SQL text, so quotes or other special characters in
    scraped names cannot break or alter the statement.

    Args:
        result: Mapping with the keys ``image``, ``gift``, ``address``,
            ``username`` and ``fans``.

    Returns:
        None. Any exception is printed together with the record and then
        swallowed, so one bad row does not stop the caller.
    """
    sql_insert = (
        "insert into huajiao1(image,gift,address,username,fans) "
        "values(%s,%s,%s,%s,%s)"
    )
    try:
        params = (
            result['image'],
            result['gift'],
            result['address'],
            result['username'],
            result['fans'],
        )
        # execute() returns the number of affected rows; commit only when a
        # row was actually inserted.
        if cursor.execute(sql_insert, params):
            print("插入成功", result)
            conn.commit()
    except Exception as e:
        print(e)
        print("存储失败", result)
83 
84 
def main():
    """Crawl category 1000 and store the profile behind each live room.

    For every live-room ID found on the category page, resolves the user ID
    with ``get_userId`` and passes it to ``get_userData``.
    """
    live_ids = get_one_page('http://www.huajiao.com/category/1000')
    if live_ids is None:
        # get_one_page returns None when the request fails; iterating over
        # None would raise TypeError.
        print("获取直播列表失败")
        return

    for live_id in live_ids:
        get_userData(get_userId(live_id))


if __name__ == '__main__':
    try:
        main()
    finally:
        # Release the database resources even if the crawl aborts.
        cursor.close()
        conn.close()
View Code

结果:

 

 第一次,是存储失败,提示(Incorrect string value: '\xF0\x9F\x98\x82\xF0\x9F...'),后来修改数据库表中的username列的编码为

才能将这条存储成功,但是存储后,相对应的符号变成了问号。

 

没有解决的问题:在数据库中存储 emoji 表情符号时,由于字符编码的问题,emoji 存储到数据库后都变成了问号。

这个问题我今天还没有修复,查阅资料后确认是字符编码的问题。

查到相关博客:http://www.cnblogs.com/h--d/p/5712490.html

http://blog.csdn.net/fhzaitian/article/details/53168551

 

posted @ 2017-10-15 22:01  我要成为女技术宅  阅读(770)  评论(0)    收藏  举报