import requests
from bs4 import BeautifulSoup
import lxml  # parser backend used by BeautifulSoup below
import re
import time
import random
import pymysql.cursors
from selenium import webdriver
import pandas

# Pool of desktop User-Agent strings to rotate through when a request gets blocked.
ues_age = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)",
]

# Request headers (including a logged-in session cookie) sent with every shop-page request.
payload = {
    "Accept-Encoding": "gzip, deflate, sdch, br",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Connection": "keep-alive",
    "Cookie": "hng=; uss=UIMY14A%2B04Bbq%2BqRxS6C9OzJWudsw14Q1kb5mDDqxW%2BQ3YG%2BUcpgrDRWnRQ%3D; uc3=sg2=AC4AfXCJ7XkLw0gCUD1tD9ZxhXFdweN2A6VfybWadxI%3D&nk2=&id2=&lg2=; t=3c0787f77a28e0854ef28fc360b2c555; cookie2=1c912d33e44bdb2008763748702a61f4; _tb_token_=78577371d8136; l=AiQkmjyCyPnG7qTN1Iu5fBqvdCgWvUgn; isg=AvDwL_qYXdDeegACSXGXiIOKwb7f2NSDXgsSOepBvMsepZFPkkmkE0aNixo_; pnm_cku822=; cna=T7gREcWMLDsCAavWmjBJPJpS; Hm_lvt_c478afee593a872fd45cb9a0d7a9da3b=1495496950; Hm_lpvt_c478afee593a872fd45cb9a0d7a9da3b=1495496950",
    "Host": "tanggulake.tmall.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest",
}


class SpiderProxy(object):
    """Scrapes free HTTP proxies from xicidaili.com."""

    headers = {
        "Host": "www.xicidaili.com",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:47.0) Gecko/20100101 Firefox/47.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
        "Referer": "http://www.xicidaili.com/wt/1",
    }

    def __init__(self, session_url):
        self.req = requests.session()
        self.req.get(session_url)

    def get_pagesource(self, url):
        html = self.req.get(url, headers=self.headers)
        return html.content

    def get_all_proxy(self, url, n):
        # Walk listing pages 1..n-1 and collect the IP/port columns of the proxy table.
        data = []
        for i in range(1, n):
            html = self.get_pagesource(url + str(i))
            soup = BeautifulSoup(html, "lxml")
            table = soup.find("table", id="ip_list")
            for row in table.findAll("tr"):
                cells = row.findAll("td")
                tmp = [item.find(text=True) for item in cells]
                data.append(tmp[1:3])
        return data


def ip(c, b):
    """Return up to b working "ip:port" proxies, scraped from the first c-1 listing pages."""
    ips = []
    session_url = 'http://www.xicidaili.com/wt/1'
    url = 'http://www.xicidaili.com/wt/'
    p = SpiderProxy(session_url)
    proxy_ip = p.get_all_proxy(url, c)
    for item in proxy_ip:
        if item:
            a = item[0] + ":" + item[1]
            proxies = {
                "http": a,
                "https": a,
            }
            try:
                # Keep only proxies that can actually reach the outside world.
                requests.get("http://ip.chinaz.com/getip.aspx", proxies=proxies, timeout=5)
                ips.append(a)
                if len(ips) == b:
                    break
            except Exception:
                continue
    return ips


def pig(url):
    """Return the total number of listing pages for a shop URL."""
    url_re = requests.get(url + "1", headers=payload)
    soup = BeautifulSoup(url_re.text, "lxml")
    pig = soup.select("div > div > div > div > span:nth-of-type(1)")
    # The pager text looks like "1/12"; the part after the slash is the page count.
    return pig[2].text.split("/")[1]


def xingxi(x):
    """Crawl every listing page of a shop and return a DataFrame of 标题/价格/图片链接/id."""
    ids = []
    pigg = []
    dates1 = []
    for pig_id in range(1, int(pig(x)) + 1):
        ur1 = x + str(pig_id)
        url_re1 = requests.get(ur1, headers=payload)
        time.sleep(random.randrange(1, 5))
        soup = BeautifulSoup(url_re1.text, "lxml")
        # Item ids are carried on the <dl data-id="..."> nodes.
        date = soup.select("div > div > div > dl")
        for spid in date:
            ids.append(re.sub(r"\D", "", spid.get("data-id")))
        # Product images.
        imgs = soup.select("img")
        for imgasd in imgs:
            w = imgasd.get("src")
            p = re.match(r".*//(.*?\.jpg)", w)
            pigg.append(r"https://" + p.group(1))
        # Title and price live inside the <dl> blocks.
        dated = soup.select("dl")
        for i in dated:
            c = list(i.stripped_strings)                # strip whitespace
            b = [elem for elem in c if elem != '¥']     # drop the currency symbol
            dates1.append([b[0], b[2]])
    shuju2 = pandas.DataFrame(pigg).rename(columns={0: "图片链接"})
    shuju3 = pandas.DataFrame(ids).rename(columns={0: "id"})
    shuju1 = pandas.DataFrame(dates1).rename(columns={0: "标题", 1: "价格"})
    return pandas.concat([shuju1, shuju2, shuju3], axis=1)


def how_much(ids, ip):
    """Open an item page with PhantomJS and return [颜色, 类目, 商品地址]; retry through a proxy on failure."""
    head = ['--ignore-ssl-errors=true', '--load-images=false']
    driver = webdriver.PhantomJS(service_args=head)
    try:
        driver.get("http://item.taobao.com/item.htm?id=" + str(ids))
        time.sleep(random.randrange(1, 5))
        date = driver.page_source
    except:
        driver.quit()
        driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true',
                                                   '--load-images=false',
                                                   '--proxy=' + ip])
        driver.get("http://item.taobao.com/item.htm?id=" + str(ids))
        date = driver.page_source
    soup = BeautifulSoup(date, "lxml")
    try:
        color = re.findall(r'<li title="(.*)">颜色分类|li title="(.*)">主要颜色',
                           str(soup.select(".attributes-list")))
        color = [i for i in color[0] if i]
    except:
        color = ["null"]
    try:
        leimu = soup.select(".tb-pine")[0].get("data-catid")
    except:
        leimu = "null"
    id_dress = "http://item.taobao.com/item.htm?id=" + str(ids)
    return [color, leimu, id_dress]


connection = pymysql.connect(host='localhost',
                             user='root',
                             password='123',
                             db='aaa',
                             charset='utf8mb4',
                             cursorclass=pymysql.cursors.DictCursor)
# Shop list: one row per competitor store, with its name and listing-page URL.
with connection.cursor() as cursor:
    sql = "select * from 竞店"
    cursor.execute(sql)
    shop_id = cursor.fetchall()
connection.commit()

# Collect 30 verified proxies from the first 3 proxy-listing pages.
ips = ip(4, 30)
ipss = 0
proxies = {
    "http": ips[ipss],
    "https": ips[ipss],
}

# Pass 1: crawl each shop's listing pages and insert any items not yet in its table.
for dress in shop_id:
    try:
        result = xingxi(dress["地址"])
    except:
        # Blocked: rotate the User-Agent, advance to the next verified proxy, then retry once.
        payload["User-Agent"] = random.choice(ues_age)
        ipss = ipss + 1
        proxies = {
            "http": ips[ipss],
            "https": ips[ipss],
        }
        result = xingxi(dress["地址"])
    with connection.cursor() as cursor:
        sql = 'select id from ' + dress["店铺名称"]
        cursor.execute(sql)
        fff = cursor.fetchall()
        fff = [i["id"] for i in fff]
        for w in result.values:
            if w[3] not in fff:
                sql = "INSERT INTO " + dress["店铺名称"] + " (`id`,图片链接,价格,标题) VALUES (%s,%s,%s,%s)"
                cursor.execute(sql, (w[3], w[2], w[1], w[0]))
    # The connection is not autocommit by default, so commit to save the changes.
    connection.commit()

# Pass 2: open each stored item with PhantomJS and fill in colour, category and item URL.
with connection.cursor() as cursor:
    for i in shop_id:
        sql = 'select id from ' + i["店铺名称"]
        cursor.execute(sql)
        q = cursor.fetchall()
        ids = [row["id"] for row in q]
        for good_id in ids:
            try:
                dates = how_much(good_id, ips[ipss])
            except:
                # Current proxy failed: advance to the next one and retry once.
                ipss = ipss + 1
                dates = how_much(good_id, ips[ipss])
            with connection.cursor() as cursorss:
                sql = "UPDATE " + i["店铺名称"] + \
                      " SET 颜色='%s',类目='%s',商品地址='%s' where id = '%s'" \
                      % (dates[0][0], dates[1], dates[2], good_id)
                print(sql)
                try:
                    cursorss.execute(sql)
                except:
                    print(good_id)
                    continue
            connection.commit()

connection.commit()
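# ---------------------------------------------------------------------------
# The script assumes the database `aaa` already contains a 竞店 table plus one
# table per shop (named after 店铺名称). A minimal sketch of that schema,
# inferred only from the SELECT/INSERT/UPDATE statements above — column types,
# lengths, and the example shop name "示例店铺" are assumptions, not part of
# the original script — could look like this:
#
# import pymysql
#
# conn = pymysql.connect(host='localhost', user='root', password='123',
#                        db='aaa', charset='utf8mb4')
#
# CREATE_SHOP_LIST = """
# CREATE TABLE IF NOT EXISTS 竞店 (
#     店铺名称 VARCHAR(64)  NOT NULL,   -- also used as the per-shop table name
#     地址     VARCHAR(255) NOT NULL    -- listing-page URL; the page number is appended
# )
# """
#
# CREATE_PER_SHOP = """
# CREATE TABLE IF NOT EXISTS {shop} (
#     id       VARCHAR(32) PRIMARY KEY,
#     图片链接 VARCHAR(255),
#     价格     VARCHAR(32),
#     标题     VARCHAR(255),
#     颜色     VARCHAR(255),
#     类目     VARCHAR(64),
#     商品地址 VARCHAR(255)
# )
# """
#
# with conn.cursor() as cur:
#     cur.execute(CREATE_SHOP_LIST)
#     cur.execute(CREATE_PER_SHOP.format(shop="示例店铺"))  # hypothetical shop name
# conn.commit()
# ---------------------------------------------------------------------------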