保存花椒直播数据
"""Scrape Huajiao live-stream host data and store it in MySQL.

Example profile page: http://www.huajiao.com/user/52231287

Notes collected while writing this script:
- a set() grows with add(), not append()
- usage of BeautifulSoup.find_all
- usage of regular expressions
- the difference between response.text and response.content
"""
import re

from bs4 import BeautifulSoup
import pymysql
import requests
from requests.exceptions import RequestException

from config import *

# Seconds to wait for a server before giving up; without a timeout a stalled
# connection would block the crawl indefinitely.
_REQUEST_TIMEOUT = 10

conn = pymysql.connect(
    # HOST, PORT, USER, PASSWORD,
    host='127.0.0.1',
    port=3307,
    user='root',
    password='root',
    db='songqin',
    # utf8mb4 lets the connection carry 4-byte characters such as emoji.
    # NOTE(review): the huajiao1 table columns presumably need utf8mb4 as
    # well for emoji to be stored intact -- confirm against the schema.
    charset='utf8mb4',
    # cursorclass=pymysql.cursors.DictCursor
)
# Obtain a cursor shared by every insert below.
cursor = conn.cursor()


def get_one_page(url):
    """Collect the live-room IDs linked from a listing page.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        A set holding, for every <a> tag whose href starts with "/l/", the
        part of the href that follows that prefix; None when the request
        raises RequestException.
    """
    try:
        response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    except RequestException:
        return None
    soup = BeautifulSoup(response.text, 'html.parser')
    # link is a tag; read its href attribute and drop the 3-character "/l/"
    # prefix. (Pulling the digits out with a regex would work as well.)
    return {
        link.attrs['href'][3:]
        for link in soup.find_all('a', href=re.compile('^(/l/)'))
    }


def get_userId(liveId):
    """Extract a user ID from the <title> of a live-room page.

    Args:
        liveId: Live-room ID, as returned by get_one_page.

    Returns:
        The first "<word chars>:<digits>" match in the page title with its
        first 8 characters removed.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched.
        AttributeError: If the page has no <title>.
        IndexError: If the title contains no "<word chars>:<digits>" match.
    """
    response = requests.get(
        'http://www.huajiao.com/l/{}'.format(liveId),
        timeout=_REQUEST_TIMEOUT,
    )
    soup = BeautifulSoup(response.text, 'html.parser')
    title_text = soup.title.get_text()
    matches = re.findall(r'(\w+:\d+)', title_text)
    # NOTE(review): the slice assumes the matched label plus its colon is
    # exactly 8 characters long -- confirm against a real page title.
    return matches[0][8:]


def get_userData(userId):
    """Scrape one user's profile page and store the result in MySQL.

    Builds a dict with the keys image, username, address, fans and gift from
    the element with id "userInfo", prints it, and passes it to
    save_to_mysql.

    Args:
        userId: User ID, as returned by get_userId.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched.
        AttributeError: If an expected element is missing from the page.
        IndexError: If the header or the stats list has fewer fields than
            the indices read below.
    """
    response = requests.get(
        'http://www.huajiao.com/user/{}'.format(userId),
        timeout=_REQUEST_TIMEOUT,
    )
    soup = BeautifulSoup(response.text, 'html.parser')
    user_info = soup.find('div', {'id': 'userInfo'})
    # Matching on the full class attribute value did not work here, so only
    # the "avatar" class is used.
    image = user_info.find('div', {'class': 'avatar'}).img.attrs['src']
    data = {}
    data['image'] = image
    # The header text joins into something like
    # "<username>|花椒号:65245592|<region>".
    header_fields = user_info.h3.get_text('|', strip=True).split('|')
    data['username'] = header_fields[0]
    data['address'] = header_fields[2]
    # NOTE(review): positions 0 and 4 of the stats list are presumably the
    # fan count and the gift count -- confirm against the page markup.
    stats_fields = (
        user_info.find('ul', {'class': 'clearfix'})
        .get_text('|', strip=True)
        .split('|')
    )
    data['fans'] = stats_fields[0]
    data['gift'] = stats_fields[4]
    print("正在插入", data)
    save_to_mysql(data)
    print('\n')


def save_to_mysql(result):
    """Insert one scraped record into the huajiao1 table.

    The values are passed to the driver as query parameters instead of being
    formatted into the SQL text, so quotes or other special characters in
    scraped strings can neither break nor alter the statement.

    Any exception is printed together with the record and not re-raised, so
    one bad record does not stop the crawl.

    Args:
        result: Mapping with the keys image, gift, address, username, fans.
    """
    sql_insert = (
        "insert into huajiao1(image,gift,address,username,fans) "
        "values(%s,%s,%s,%s,%s)"
    )
    try:
        params = (
            result['image'],
            result['gift'],
            result['address'],
            result['username'],
            result['fans'],
        )
        # execute() returns the number of affected rows.
        if cursor.execute(sql_insert, params):
            print("插入成功", result)
            conn.commit()
    except Exception as e:
        print(e)
        print("存储失败", result)


def main():
    """Crawl the category listing and store every host found on it."""
    live_ids = get_one_page('http://www.huajiao.com/category/1000')
    if live_ids is None:
        # The listing request failed; there is nothing to iterate over.
        print("获取直播列表失败")
        return
    for live_id in live_ids:
        try:
            user_id = get_userId(live_id)
            get_userData(user_id)
        except RequestException as e:
            # Skip this live room and keep crawling the remaining ones.
            print("抓取失败", live_id, e)


if __name__ == '__main__':
    main()
结果:

第一次,是存储失败,提示(Incorrect string value: '\xF0\x9F\x98\x82\xF0\x9F...'),后来修改数据库表中的username列的编码为

才能将这条存储成功,但是存储后,相对应的符号变成了问号。
没有解决的问题:在数据库中存储 emoji 表情符号时,由于字符编码的问题,
表情符号存储到数据库后都变成了问号。
但是我今天没有修改,查阅到是字符编码的问题。
查到相关博客:http://www.cnblogs.com/h--d/p/5712490.html
http://blog.csdn.net/fhzaitian/article/details/53168551

浙公网安备 33010602011771号