# coding:utf-8
# Import the required modules
import requests
import re
import pymysql

# Build the list of paginated review URLs
urls = []
for i in range(1, 100):
    urls.append(
        'https://rate.tmall.com/list_detail_rate.htm?itemId=571551508744&spuId=990279334&sellerId=420567757&order=3&currentPage=%s' % i)

# Containers for the extracted fields
name = []
time = []
ratecontent = []

# Loop over the pages and scrape the data
for url in urls:
    content = requests.get(url).text
    print(content)
    # Use re.findall to pull out the nickname, review text and review date
    name.extend(re.findall(re.compile('"displayUserNick":"(.*?)","displayUserNumId"'), content))
    ratecontent.extend(re.findall(re.compile('"rateContent":"(.*?)","rateDate"'), content))
    time.extend(re.findall(re.compile('"rateDate":"(.*?)","reply"'), content))

# Write the results into MySQL
con = pymysql.connect(host='localhost', user='root', password='zn025425', database='lishuisg', charset='utf8')
cur = con.cursor()
# cur.execute("CREATE TABLE TAOBAO(NAME VARCHAR(30),TIME VARCHAR(30),RATECONTENT VARCHAR(100))")
for i in range(len(name)):
    sql = "insert into taobao VALUES(%s,%s,%s)"
    cur.execute(sql, (name[i], time[i], ratecontent[i]))
con.commit()
cur.close()
con.close()
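
If the review count grows, the per-row insert loop above could also be written as a single batched call; a minimal sketch that would replace the for-loop (same table and placeholders, using pymysql's executemany):

# Batched variant of the insert loop (assumes the taobao table already exists)
rows = list(zip(name, time, ratecontent))
cur.executemany("insert into taobao VALUES(%s,%s,%s)", rows)
con.commit()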
Login reference: https://www.cnblogs.com/4wheel/p/9251463.html
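
Tmall detail pages often require a logged-in session, which is what the login reference above is about. A minimal sketch, assuming cookies copied from a logged-in browser session (the cookie names and values below are placeholders, and url/header are the ones defined in the script that follows):

cookies = {'_tb_token_': '<token>', 'cookie2': '<value>'}  # placeholder names/values, not real credentials
page = requests.get(url, headers=header, cookies=cookies).content.decode("gbk")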



# coding:utf-8
# Import the required modules
import requests
import re
import json
from lxml import etree
from ProductInfo import ProductInfo

# Return a default value when the input is None or empty
def isEmpty(value, defaultValue):
    if value is None or value == '':
        return defaultValue
    else:
        return value

# Return the text of the first <li> under product_info whose text contains indexStr
def getSubinfo(product_info, indexStr):
    parts = product_info.xpath("li[contains(text(),'%s')]" % indexStr)
    length = len(parts)
    if length > 0:
        print(length)
        return parts[0].text
    else:
        return None


# Fetch the detail page and cut out the fields with XPath
url = "https://detail.tmall.com/item.htm?spm=a220m.1000858.1000725.2.4c97f976iVkTcV&id=527796865234&skuId=3715420611025&user_id=2816031767&cat_id=2&is_b=1&rn=17d083d0e074321ddd73008934f56c3a"
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
page = requests.get(url, headers=header).content.decode("gbk")
tree = etree.HTML(page)
product_infos = tree.xpath('//ul[@id="J_AttrUL"]')
for product_info in product_infos:
    # Cut the product attribute list to get the product name
    # ('nihao' is a placeholder; substitute the actual attribute label, e.g. '产品名称:')
    productName = getSubinfo(product_info, 'nihao')
    productPrice = 100  # the price is hard-coded in this example
    # Cut the product attribute list to get the product model
    productModel = tree.xpath(u"//li[contains(text(),'型号:')]")[0].text
    p = ProductInfo(productName, productPrice, productModel)

    print(isEmpty(p.productName, '无') + ';' + isEmpty(str(p.productPrice), '无') + ';' + isEmpty(p.productModel, '无'))





-------------------------

class ProductInfo:
    'Base class for all products'

    def __init__(self, productName, productPrice, productModel):
        self.productName = productName
        self.productPrice = productPrice
        self.productModel = productModel

    def displayProduct(self):
        print("productName : ", self.productName, ", productPrice: ", self.productPrice, ", productModel: ", self.productModel)




-----------------------------


# coding:utf-8
# Import the required modules
import csv

import requests
import math
import json
import urllib.parse
from lxml import etree

# Return the first element of a collection, or "无" when it is empty
def getfirst(obj):
    if len(obj) > 0:
        return obj[0].strip()
    else:
        return "无"

header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.235'
}

# Steps to build the review URLs: search page -> product id -> review link
# 1. Fetch the search result page for a keyword
def search_info(info):
    urlcode = urllib.parse.quote(info)
    search_url = u"https://search.suning.com/" + urlcode + "/"
    search_page = requests.get(search_url, headers=header).content.decode("utf-8")
    return search_page

# 1. Fetch the search result page
search_page = search_info("松下空气净化器")

# 2. Extract the product ids and assemble the product detail URLs
search_tree = etree.HTML(search_page)
url_set = set()
sa_datas = search_tree.xpath('.//div[@class="title-selling-point"]/a/@sa-data')
for sa_data in sa_datas:
    sa_data_json = json.loads(sa_data.replace("\'", "\""))
    prdid = sa_data_json['prdid']
    shopid = sa_data_json['shopid']
    product_url = u"https://product.suning.com/" + shopid + "/" + prdid + ".html#pro_detail_tab"
    # print(prdid.zfill(18))
    # 3. Grab the review-list URL from the product detail page
    product_page = requests.get(product_url, headers=header).content.decode("utf-8")
    product_tree = etree.HTML(product_page)
    pingluns_url = getfirst(product_tree.xpath('.//a[text()="查看全部评论"]/@href'))
    url_set.add(pingluns_url)
    print(product_page)
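
The loop above only stores the review-list links in url_set and prints the product pages; the script below then works from one hard-coded review URL instead. If you wanted to walk the collected set, a minimal sketch (assuming the hrefs are absolute URLs, which is worth checking):

for pinglun_url in url_set:
    if pinglun_url == "无":  # getfirst returns "无" when no review link was found
        continue
    review_page = requests.get(pinglun_url, headers=header).content.decode("utf-8")
    print(pinglun_url, len(review_page))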




# Example review-list URL:
# https://review.suning.com/cluster_cmmdty_review/general-30048005-000000000101340755-0070086354-1
page = 1
# Suning review-list URL for the first page
url = "https://review.suning.com/cluster_cmmdty_review/general-30200492-000000000132186273-0000000000-" + str(page) + "-total.htm"
# header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}

# Open the CSV file the reviews will be written to
out = open('D:/suning_pingjia.csv', 'a', newline='')
csv_write = csv.writer(out, dialect='excel')
page = requests.get(url, headers=header).content.decode("utf-8")
tree = etree.HTML(page)

# Get the total review count (the span text is wrapped in parentheses) and derive the page count, 10 reviews per page
totalnum = tree.xpath('.//a[@class="rv-maidian"]/p[text()="全部"]/span/text()')
totalPages = math.ceil(int(getfirst(totalnum).strip('()')) / 10)


# # Tags
# select = tree.xpath('.//span[@class="on"]/text()')
# # Users
# users = tree.xpath('.//div[@class="username"]/span/text()')
# # Review texts
# achievements = tree.xpath('.//div[@class="topic-body"]/p[@class="body-content"]/text()')

# Walk every review page and collect the reviews
for page_id in range(1, totalPages + 1):
    url = "https://review.suning.com/cluster_cmmdty_review/general-30200492-000000000132186273-0000000000-" + str(page_id) + "-total.htm"
    page = requests.get(url, headers=header).content.decode("utf-8")
    tree = etree.HTML(page)
    # Each review sits in one of these blocks
    fatherHtmls = tree.xpath('.//div[@class="rv-target-topic clearfix"]')
    true_url = getfirst(tree.xpath('.//link[@rel="canonical"]/@href')).replace("http://", "https://")
    if true_url != url:
        continue
    csv_write.writerow(["评论路径", true_url])
    for one in fatherHtmls:
        username = getfirst(one.xpath('.//div[@class="username"]/span/text()'))
        remark = getfirst(one.xpath('.//span[@class="on"]/text()'))
        achievement = getfirst(one.xpath('.//div[@class="topic-body"]/p[@class="body-content"]/text()'))
        print("用户:", username, "标签", remark, "评论", achievement)
        infos = [username, remark, achievement]
        csv_write.writerow(infos)

out.close()
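
As a quick sanity check, the CSV written above can be read back with the same csv module (the path matches the one opened earlier):

with open('D:/suning_pingjia.csv', newline='') as f:
    for row in csv.reader(f):
        print(row)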


