import requests
from bs4 import BeautifulSoup
import lxml  # parser backend used by BeautifulSoup below
import re
import time
import random
import pymysql.cursors
from selenium import webdriver
import pandas

# Pool of desktop User-Agent strings to rotate through when a request gets blocked.
ues_age = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)",
]

# Request headers (including a logged-in session cookie) sent with every shop-page request.
payload = {
    "Accept-Encoding": "gzip, deflate, sdch, br",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Connection": "keep-alive",
    "Cookie": "hng=; uss=UIMY14A%2B04Bbq%2BqRxS6C9OzJWudsw14Q1kb5mDDqxW%2BQ3YG%2BUcpgrDRWnRQ%3D; uc3=sg2=AC4AfXCJ7XkLw0gCUD1tD9ZxhXFdweN2A6VfybWadxI%3D&nk2=&id2=&lg2=; t=3c0787f77a28e0854ef28fc360b2c555; cookie2=1c912d33e44bdb2008763748702a61f4; _tb_token_=78577371d8136; l=AiQkmjyCyPnG7qTN1Iu5fBqvdCgWvUgn; isg=AvDwL_qYXdDeegACSXGXiIOKwb7f2NSDXgsSOepBvMsepZFPkkmkE0aNixo_; pnm_cku822=; cna=T7gREcWMLDsCAavWmjBJPJpS; Hm_lvt_c478afee593a872fd45cb9a0d7a9da3b=1495496950; Hm_lpvt_c478afee593a872fd45cb9a0d7a9da3b=1495496950",
    "Host": "tanggulake.tmall.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest",
}


class SpiderProxy(object):
    """Scrapes free HTTP proxies from xicidaili.com."""

    headers = {
        "Host": "www.xicidaili.com",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:47.0) Gecko/20100101 Firefox/47.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
        "Referer": "http://www.xicidaili.com/wt/1",
    }

    def __init__(self, session_url):
        self.req = requests.session()
        self.req.get(session_url)

    def get_pagesource(self, url):
        html = self.req.get(url, headers=self.headers)
        return html.content

    def get_all_proxy(self, url, n):
        # Walk listing pages 1..n-1 and collect the IP/port columns of the proxy table.
        data = []
        for i in range(1, n):
            html = self.get_pagesource(url + str(i))
            soup = BeautifulSoup(html, "lxml")
            table = soup.find("table", id="ip_list")
            for row in table.findAll("tr"):
                cells = row.findAll("td")
                tmp = [item.find(text=True) for item in cells]
                data.append(tmp[1:3])
        return data


def ip(c, b):
    """Return up to b working "ip:port" proxies, scraped from the first c-1 listing pages."""
    ips = []
    session_url = 'http://www.xicidaili.com/wt/1'
    url = 'http://www.xicidaili.com/wt/'
    p = SpiderProxy(session_url)
    proxy_ip = p.get_all_proxy(url, c)
    for item in proxy_ip:
        if item:
            a = item[0] + ":" + item[1]
            proxies = {
                "http": a,
                "https": a,
            }
            try:
                # Keep only proxies that can actually reach the outside world.
                requests.get("http://ip.chinaz.com/getip.aspx", proxies=proxies, timeout=5)
                ips.append(a)
                if len(ips) == b:
                    break
            except Exception:
                continue
    return ips


def pig(url):
    """Return the total number of listing pages for a shop URL."""
    url_re = requests.get(url + "1", headers=payload)
    soup = BeautifulSoup(url_re.text, "lxml")
    pig = soup.select("div > div > div > div > span:nth-of-type(1)")
    # The pager text looks like "1/12"; the part after the slash is the page count.
    return pig[2].text.split("/")[1]


def xingxi(x):
    """Crawl every listing page of a shop and return a DataFrame of 标题/价格/图片链接/id."""
    ids = []
    pigg = []
    dates1 = []
    for pig_id in range(1, int(pig(x)) + 1):
        ur1 = x + str(pig_id)
        url_re1 = requests.get(ur1, headers=payload)
        time.sleep(random.randrange(1, 5))
        soup = BeautifulSoup(url_re1.text, "lxml")
        # Item ids are carried on the <dl data-id="..."> nodes.
        date = soup.select("div > div > div > dl")
        for spid in date:
            ids.append(re.sub(r"\D", "", spid.get("data-id")))
        # Product images.
        imgs = soup.select("img")
        for imgasd in imgs:
            w = imgasd.get("src")
            p = re.match(r".*//(.*?\.jpg)", w)
            pigg.append(r"https://" + p.group(1))
        # Title and price live inside the <dl> blocks.
        dated = soup.select("dl")
        for i in dated:
            c = list(i.stripped_strings)                # strip whitespace
            b = [elem for elem in c if elem != '¥']     # drop the currency symbol
            dates1.append([b[0], b[2]])
    shuju2 = pandas.DataFrame(pigg).rename(columns={0: "图片链接"})
    shuju3 = pandas.DataFrame(ids).rename(columns={0: "id"})
    shuju1 = pandas.DataFrame(dates1).rename(columns={0: "标题", 1: "价格"})
    return pandas.concat([shuju1, shuju2, shuju3], axis=1)


def how_much(ids, ip):
    """Open an item page with PhantomJS and return [颜色, 类目, 商品地址]; retry through a proxy on failure."""
    head = ['--ignore-ssl-errors=true', '--load-images=false']
    driver = webdriver.PhantomJS(service_args=head)
    try:
        driver.get("http://item.taobao.com/item.htm?id=" + str(ids))
        time.sleep(random.randrange(1, 5))
        date = driver.page_source
    except:
        driver.quit()
        driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true',
                                                   '--load-images=false',
                                                   '--proxy=' + ip])
        driver.get("http://item.taobao.com/item.htm?id=" + str(ids))
        date = driver.page_source
    soup = BeautifulSoup(date, "lxml")
    try:
        color = re.findall(r'<li title="(.*)">颜色分类|li title="(.*)">主要颜色',
                           str(soup.select(".attributes-list")))
        color = [i for i in color[0] if i]
    except:
        color = ["null"]
    try:
        leimu = soup.select(".tb-pine")[0].get("data-catid")
    except:
        leimu = "null"
    id_dress = "http://item.taobao.com/item.htm?id=" + str(ids)
    return [color, leimu, id_dress]


connection = pymysql.connect(host='localhost',
                             user='root',
                             password='123',
                             db='aaa',
                             charset='utf8mb4',
                             cursorclass=pymysql.cursors.DictCursor)
# Shop list: one row per competitor store, with its name and listing-page URL.
with connection.cursor() as cursor:
    sql = "select * from 竞店"
    cursor.execute(sql)
    shop_id = cursor.fetchall()
connection.commit()

# Collect 30 verified proxies from the first 3 proxy-listing pages.
ips = ip(4, 30)
ipss = 0
proxies = {
    "http": ips[ipss],
    "https": ips[ipss],
}

# Pass 1: crawl each shop's listing pages and insert any items not yet in its table.
for dress in shop_id:
    try:
        result = xingxi(dress["地址"])
    except:
        # Blocked: rotate the User-Agent, advance to the next verified proxy, then retry once.
        payload["User-Agent"] = random.choice(ues_age)
        ipss = ipss + 1
        proxies = {
            "http": ips[ipss],
            "https": ips[ipss],
        }
        result = xingxi(dress["地址"])
    with connection.cursor() as cursor:
        sql = 'select id from ' + dress["店铺名称"]
        cursor.execute(sql)
        fff = cursor.fetchall()
        fff = [i["id"] for i in fff]
        for w in result.values:
            if w[3] not in fff:
                sql = "INSERT INTO " + dress["店铺名称"] + " (`id`,图片链接,价格,标题) VALUES (%s,%s,%s,%s)"
                cursor.execute(sql, (w[3], w[2], w[1], w[0]))
    # The connection is not autocommit by default, so commit to save the changes.
    connection.commit()

# Pass 2: open each stored item with PhantomJS and fill in colour, category and item URL.
with connection.cursor() as cursor:
    for i in shop_id:
        sql = 'select id from ' + i["店铺名称"]
        cursor.execute(sql)
        q = cursor.fetchall()
        ids = [row["id"] for row in q]
        for good_id in ids:
            try:
                dates = how_much(good_id, ips[ipss])
            except:
                # Current proxy failed: advance to the next one and retry once.
                ipss = ipss + 1
                dates = how_much(good_id, ips[ipss])
            with connection.cursor() as cursorss:
                sql = "UPDATE " + i["店铺名称"] + \
                      " SET 颜色='%s',类目='%s',商品地址='%s' where id = '%s'" \
                      % (dates[0][0], dates[1], dates[2], good_id)
                print(sql)
                try:
                    cursorss.execute(sql)
                except:
                    print(good_id)
                    continue
            connection.commit()

connection.commit()
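# ---------------------------------------------------------------------------
# The script assumes the database `aaa` already contains a 竞店 table plus one
# table per shop (named after 店铺名称). A minimal sketch of that schema,
# inferred only from the SELECT/INSERT/UPDATE statements above — column types,
# lengths, and the example shop name "示例店铺" are assumptions, not part of
# the original script — could look like this:
#
# import pymysql
#
# conn = pymysql.connect(host='localhost', user='root', password='123',
#                        db='aaa', charset='utf8mb4')
#
# CREATE_SHOP_LIST = """
# CREATE TABLE IF NOT EXISTS 竞店 (
#     店铺名称 VARCHAR(64)  NOT NULL,   -- also used as the per-shop table name
#     地址     VARCHAR(255) NOT NULL    -- listing-page URL; the page number is appended
# )
# """
#
# CREATE_PER_SHOP = """
# CREATE TABLE IF NOT EXISTS {shop} (
#     id       VARCHAR(32) PRIMARY KEY,
#     图片链接 VARCHAR(255),
#     价格     VARCHAR(32),
#     标题     VARCHAR(255),
#     颜色     VARCHAR(255),
#     类目     VARCHAR(64),
#     商品地址 VARCHAR(255)
# )
# """
#
# with conn.cursor() as cur:
#     cur.execute(CREATE_SHOP_LIST)
#     cur.execute(CREATE_PER_SHOP.format(shop="示例店铺"))  # hypothetical shop name
# conn.commit()
# ---------------------------------------------------------------------------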