# coding:utf-8
# Import the required modules
import requests
import re
import pymysql

# Build the list of paginated review URLs
urls = []
for i in range(1, 100):
    urls.append(
        'https://rate.tmall.com/list_detail_rate.htm?itemId=571551508744&spuId=990279334&sellerId=420567757&order=3&currentPage=%s' % i)

# Containers for the extracted fields
name = []
time = []
ratecontent = []

# Loop over the pages and scrape the data
for url in urls:
    content = requests.get(url).text
    print(content)
    # Use re.findall to pull out the nickname, review text and review date
    name.extend(re.findall(re.compile('"displayUserNick":"(.*?)","displayUserNumId"'), content))
    ratecontent.extend(re.findall(re.compile('"rateContent":"(.*?)","rateDate"'), content))
    time.extend(re.findall(re.compile('"rateDate":"(.*?)","reply"'), content))

# Write the results into MySQL
con = pymysql.connect(host='localhost', user='root', password='zn025425', database='lishuisg', charset='utf8')
cur = con.cursor()
# cur.execute("CREATE TABLE TAOBAO(NAME VARCHAR(30),TIME VARCHAR(30),RATECONTENT VARCHAR(100))")
for i in range(len(name)):
    sql = "insert into taobao VALUES(%s,%s,%s)"
    cur.execute(sql, (name[i], time[i], ratecontent[i]))
con.commit()
cur.close()
con.close()
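
If the review count grows, the per-row insert loop above could also be written as a single batched call; a minimal sketch that would replace the for-loop (same table and placeholders, using pymysql's executemany):

# Batched variant of the insert loop (assumes the taobao table already exists)
rows = list(zip(name, time, ratecontent))
cur.executemany("insert into taobao VALUES(%s,%s,%s)", rows)
con.commit()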
Login reference: https://www.cnblogs.com/4wheel/p/9251463.html
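
Tmall detail pages often require a logged-in session, which is what the login reference above is about. A minimal sketch, assuming cookies copied from a logged-in browser session (the cookie names and values below are placeholders, and url/header are the ones defined in the script that follows):

cookies = {'_tb_token_': '<token>', 'cookie2': '<value>'}  # placeholder names/values, not real credentials
page = requests.get(url, headers=header, cookies=cookies).content.decode("gbk")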



# coding:utf-8
# Import the required modules
import requests
import re
import json
from lxml import etree
from ProductInfo import ProductInfo

# Return a default value when the input is None or empty
def isEmpty(value, defaultValue):
    if value is None or value == '':
        return defaultValue
    else:
        return value

# Return the text of the first <li> under product_info whose text contains indexStr
def getSubinfo(product_info, indexStr):
    parts = product_info.xpath("li[contains(text(),'%s')]" % indexStr)
    length = len(parts)
    if length > 0:
        print(length)
        return parts[0].text
    else:
        return None


# Fetch the detail page and cut out the fields with XPath
url = "https://detail.tmall.com/item.htm?spm=a220m.1000858.1000725.2.4c97f976iVkTcV&id=527796865234&skuId=3715420611025&user_id=2816031767&cat_id=2&is_b=1&rn=17d083d0e074321ddd73008934f56c3a"
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
page = requests.get(url, headers=header).content.decode("gbk")
tree = etree.HTML(page)
product_infos = tree.xpath('//ul[@id="J_AttrUL"]')
for product_info in product_infos:
    # Cut the product attribute list to get the product name
    # ('nihao' is a placeholder; substitute the actual attribute label, e.g. '产品名称:')
    productName = getSubinfo(product_info, 'nihao')
    productPrice = 100  # the price is hard-coded in this example
    # Cut the product attribute list to get the product model
    productModel = tree.xpath(u"//li[contains(text(),'型号:')]")[0].text
    p = ProductInfo(productName, productPrice, productModel)

    print(isEmpty(p.productName, '无') + ';' + isEmpty(str(p.productPrice), '无') + ';' + isEmpty(p.productModel, '无'))





-------------------------

class ProductInfo:
    'Base class for all products'

    def __init__(self, productName, productPrice, productModel):
        self.productName = productName
        self.productPrice = productPrice
        self.productModel = productModel

    def displayProduct(self):
        print("productName : ", self.productName, ", productPrice: ", self.productPrice, ", productModel: ", self.productModel)




-----------------------------


# coding:utf-8
# Import the required modules
import csv

import requests
import math
import json
import urllib.parse
from lxml import etree

# Return the first element of a collection, or "无" when it is empty
def getfirst(obj):
    if len(obj) > 0:
        return obj[0].strip()
    else:
        return "无"

header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.235'
}

# Steps to build the review URLs: search page -> product id -> review link
# 1. Fetch the search result page for a keyword
def search_info(info):
    urlcode = urllib.parse.quote(info)
    search_url = u"https://search.suning.com/" + urlcode + "/"
    search_page = requests.get(search_url, headers=header).content.decode("utf-8")
    return search_page

# 1. Fetch the search result page
search_page = search_info("松下空气净化器")

# 2. Extract the product ids and assemble the product detail URLs
search_tree = etree.HTML(search_page)
url_set = set()
sa_datas = search_tree.xpath('.//div[@class="title-selling-point"]/a/@sa-data')
for sa_data in sa_datas:
    sa_data_json = json.loads(sa_data.replace("\'", "\""))
    prdid = sa_data_json['prdid']
    shopid = sa_data_json['shopid']
    product_url = u"https://product.suning.com/" + shopid + "/" + prdid + ".html#pro_detail_tab"
    # print(prdid.zfill(18))
    # 3. Grab the review-list URL from the product detail page
    product_page = requests.get(product_url, headers=header).content.decode("utf-8")
    product_tree = etree.HTML(product_page)
    pingluns_url = getfirst(product_tree.xpath('.//a[text()="查看全部评论"]/@href'))
    url_set.add(pingluns_url)
    print(product_page)
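
The loop above only stores the review-list links in url_set and prints the product pages; the script below then works from one hard-coded review URL instead. If you wanted to walk the collected set, a minimal sketch (assuming the hrefs are absolute URLs, which is worth checking):

for pinglun_url in url_set:
    if pinglun_url == "无":  # getfirst returns "无" when no review link was found
        continue
    review_page = requests.get(pinglun_url, headers=header).content.decode("utf-8")
    print(pinglun_url, len(review_page))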




# Example review-list URL:
# https://review.suning.com/cluster_cmmdty_review/general-30048005-000000000101340755-0070086354-1
page = 1
# Suning review-list URL for the first page
url = "https://review.suning.com/cluster_cmmdty_review/general-30200492-000000000132186273-0000000000-" + str(page) + "-total.htm"
# header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}

# Open the CSV file the reviews will be written to
out = open('D:/suning_pingjia.csv', 'a', newline='')
csv_write = csv.writer(out, dialect='excel')
page = requests.get(url, headers=header).content.decode("utf-8")
tree = etree.HTML(page)

# Get the total review count (the span text is wrapped in parentheses) and derive the page count, 10 reviews per page
totalnum = tree.xpath('.//a[@class="rv-maidian"]/p[text()="全部"]/span/text()')
totalPages = math.ceil(int(getfirst(totalnum).strip('()')) / 10)


# # Tags
# select = tree.xpath('.//span[@class="on"]/text()')
# # Users
# users = tree.xpath('.//div[@class="username"]/span/text()')
# # Review texts
# achievements = tree.xpath('.//div[@class="topic-body"]/p[@class="body-content"]/text()')

# Walk every review page and collect the reviews
for page_id in range(1, totalPages + 1):
    url = "https://review.suning.com/cluster_cmmdty_review/general-30200492-000000000132186273-0000000000-" + str(page_id) + "-total.htm"
    page = requests.get(url, headers=header).content.decode("utf-8")
    tree = etree.HTML(page)
    # Each review sits in one of these blocks
    fatherHtmls = tree.xpath('.//div[@class="rv-target-topic clearfix"]')
    true_url = getfirst(tree.xpath('.//link[@rel="canonical"]/@href')).replace("http://", "https://")
    if true_url != url:
        continue
    csv_write.writerow(["评论路径", true_url])
    for one in fatherHtmls:
        username = getfirst(one.xpath('.//div[@class="username"]/span/text()'))
        remark = getfirst(one.xpath('.//span[@class="on"]/text()'))
        achievement = getfirst(one.xpath('.//div[@class="topic-body"]/p[@class="body-content"]/text()'))
        print("用户:", username, "标签", remark, "评论", achievement)
        infos = [username, remark, achievement]
        csv_write.writerow(infos)

out.close()
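
As a quick sanity check, the CSV written above can be read back with the same csv module (the path matches the one opened earlier):

with open('D:/suning_pingjia.csv', newline='') as f:
    for row in csv.reader(f):
        print(row)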


