# 数据集_for循环出来的处理


from selenium.webdriver import Chrome
from lxml import etree
# from mongodb import add_many # 调用自己写的mongodb增删改查,保存数据前,要修改mongodb增删改查里面的库名
import pymysql


# 拿到页面源代码
def get_page_source(url):
    """Open *url* in the shared browser and return the rendered HTML.

    The function uses the module-level ``web`` driver (created in the
    ``__main__`` section of this script).  The browser is quit as soon as
    the page has been read, so the same driver cannot serve a second call.

    Args:
        url: Address of the page to load.

    Returns:
        The value of ``web.page_source`` after navigating to *url*.
    """
    try:
        web.implicitly_wait(10)
        web.get(url)
        return web.page_source
    finally:
        # Quit even when navigation fails so no browser process is left behind.
        web.quit()


# 解析
_LISTING_XPATH = '//*[@id="content"]/div[1]/ul/li'
_HOUSE_KEYS = ("huxing", "mianji", "chaoxiang", "zhuangxiu", "louceng", "nianfen", "jiegou")


def _split_house_info(text):
    """Split the ``|``-separated house description into exactly seven fields."""
    parts = [part.strip() for part in text.split("|")]
    if len(parts) == 6:
        # Per the original author's notes the build year is the value that is
        # sometimes absent; keep its slot (index 5) but leave it empty.
        parts.insert(5, "")
    # Drop surplus trailing values (the original notes mention an extra
    # "别墅" entry) and pad anything that is still short, so the result
    # always lines up with _HOUSE_KEYS.
    return (parts + [""] * len(_HOUSE_KEYS))[:len(_HOUSE_KEYS)]


def parse(html):
    """Extract one record per listing from a result page.

    Args:
        html: Page source of a listing page, as text.

    Returns:
        A list of dicts with the keys ``title``, ``position``, ``huxing``,
        ``mianji``, ``chaoxiang``, ``zhuangxiu``, ``louceng``, ``nianfen``,
        ``jiegou`` and ``tags`` (in that order).  ``tags`` is a list of
        strings; every other value is a string.  List items without a title
        or without a house description are skipped.
    """
    if not html:
        return []
    tree = etree.HTML(html)
    if tree is None:
        return []

    result = []
    for li in tree.xpath(_LISTING_XPATH):
        titles = li.xpath('./div[1]/div[1]/a/text()')
        house_info = li.xpath('./div[1]/div[3]/div/text()')
        if not titles or not house_info:
            # Not a regular listing entry; nothing to extract.
            continue
        record = {
            "title": titles[0],
            # Several text nodes are joined with "-" and all spaces removed.
            "position": "-".join(li.xpath('./div[1]/div[2]/div/a/text()')).replace(" ", ""),
        }
        record.update(zip(_HOUSE_KEYS, _split_house_info(house_info[0])))
        # Keep every tag; the list is flattened later when saving to MySQL.
        record["tags"] = li.xpath('./div[1]/div[5]/span/text()')
        result.append(record)
    return result


# 保存到mongodb
def save_to_mongo(data_list):
    """Store one page of records via the project's ``mongodb`` helper module.

    Args:
        data_list: Records to store; passed unchanged to ``add_many`` together
            with the name ``"ershoufang"`` (presumably the target collection;
            confirm against the helper module).
    """
    # Imported here because the module-level import of this helper is
    # commented out, which left ``add_many`` undefined when this function ran.
    from mongodb import add_many

    add_many("ershoufang", data_list)
    print("一页保存完毕")


_MYSQL_COLUMNS = (
    "title", "position", "huxing", "mianji", "chaoxiang",
    "zhuangxiu", "louceng", "nianfen", "jiegou", "tags",
)


def _to_mysql_row(record):
    """Return *record* as a tuple ordered like ``_MYSQL_COLUMNS``; tags become one string."""
    tags = record["tags"]
    if not isinstance(tags, str):
        tags = ",".join(tags)
    return tuple(tags if column == "tags" else record[column] for column in _MYSQL_COLUMNS)


def save_to_mysql(data_list):
    """Insert the records into the ``ershoufang`` table of the ``spider`` database.

    Errors are printed rather than raised (best effort); when a connection
    exists the transaction is rolled back.  The dicts in *data_list* are not
    modified.

    Args:
        data_list: Iterable of dicts holding the ten column values; ``tags``
            may be a list of strings (joined with ``","``) or a string.
    """
    if not data_list:
        return  # nothing to insert

    # Bound up front so the cleanup code below never sees an undefined name
    # when connecting fails.
    conn = None
    cursor = None
    try:
        # Pick values by column name so the row order does not depend on the
        # order in which the dict keys were inserted.
        rows = [_to_mysql_row(record) for record in data_list]
        # NOTE(review): credentials are hard-coded; consider moving them to configuration.
        conn = pymysql.connect(host="localhost", port=3306, user="root", password="1234", database="spider")
        cursor = conn.cursor()
        # The %s markers are driver placeholders, one per column.
        sql = "insert into ershoufang(title,position,huxing,mianji,chaoxiang,zhuangxiu,louceng,nianfen,jiegou,tags) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
        cursor.executemany(sql, rows)
        conn.commit()
    except Exception as e:
        print(e)
        if conn is not None:
            conn.rollback()
    finally:
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()


if __name__ == '__main__':
    # To crawl several pages, loop over the page numbers instead, e.g.
    #     for i in range(31, 33):
    #         url = f'https://bj.lianjia.com/ershoufang/pg{i}/'
    #         time.sleep(2)
    # (this needs ``import time`` and a fresh driver per page, because
    # get_page_source() quits the browser after each fetch).
    url = "https://bj.lianjia.com/ershoufang/pg18/"
    # Deliberately module-level: get_page_source() reads ``web`` as a global.
    web = Chrome()
    page_source = get_page_source(url)
    data_list = parse(page_source)
    # Persistence is switched off; enable one of the calls below to store the page.
    # To inspect MongoDB afterwards, in the mongo shell: `use haha`, then
    # db.ershoufang.find() (type `it` for more rows) or db.ershoufang.count().
    # save_to_mongo(data_list)
    # print(data_list)
    # save_to_mysql(data_list)


# posted @ 2023-08-16 15:55  严永富  阅读(11)  评论(0)    收藏  举报