# JD phone-listing spider:
# - mimics a mobile browser User-Agent for image downloads
# - stores crawled records in a SQLite database (phones.db)
# - saves product images into the downloadSpider folder
# - drives a headless Chrome through JD search-result pages
import datetime
import os
import sqlite3
import threading
import time
import urllib
import urllib.request

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
class MySipider:
    """Spider that crawls JD search-result pages, stores each item in a
    SQLite database (phones.db) and downloads product images in
    background threads into the ``imagePath`` folder.
    """

    # HTTP headers for image downloads so they look like a mobile browser.
    # Key fixed: "user_Agent" is not a real header name; servers only
    # recognize "User-Agent".
    herder = {
        "User-Agent": "Mozilla/5.0 (Linux; U; Android 8.1.0; zh-cn; BLA-AL00 Build/HUAWEIBLA-AL00"
    }
    # Folder that downloaded images are written to.
    imagePath = "downloadSpider"

    def StartUp(self, url, key):
        """Open a headless Chrome, (re)create the phones table, empty the
        image folder, then load *url* and submit a search for *key*.
        """
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
        self.driver = webdriver.Chrome(options=options)
        self.driver.maximize_window()
        self.thread = []   # download threads, joined in ExecuteSpider
        self.No = 0        # running record number
        self.imgNo = 0
        # (Re)create the data table.  Dropping may fail on first run when
        # the table does not exist yet; that is expected and ignored.
        try:
            self.con = sqlite3.connect("phones.db")
            self.cursor = self.con.cursor()
            try:
                # Table name fixed to "phones" — the rest of the class
                # inserts into and selects from "phones".
                self.cursor.execute("drop table phones")
            except sqlite3.Error:
                pass  # table did not exist yet
            sql = ("create table phones("
                   "mNO varchar(32) primary key,mMark varchar(126),"
                   "mNote varchar(126),mPrice varchar(126),mFile varchar(126))")
            self.cursor.execute(sql)
        except sqlite3.Error as err:
            print(err)
        # Initialize the download folder: create it if needed, then remove
        # leftover images from a previous run.
        try:
            if not os.path.exists(MySipider.imagePath):
                os.mkdir(MySipider.imagePath)
            for img in os.listdir(MySipider.imagePath):
                os.remove(os.path.join(MySipider.imagePath, img))
        except Exception as err:
            print(err)
        # Load the first page and submit the search keyword.
        self.driver.get(url)
        keyInput = self.driver.find_element_by_id('key')
        keyInput.send_keys(key)
        # Fixed: the original sent the keyword twice and never submitted
        # the search; pressing ENTER triggers it.
        keyInput.send_keys(Keys.ENTER)

    def CloseUp(self):
        """Commit and close the database, then close the browser."""
        try:
            self.con.commit()
            self.con.close()
            self.driver.close()
        except Exception as err:
            print(err)

    def InsertDB(self, mNO, mMark, mPrice, mNote, mFile):
        """Insert one crawled record into the phones table.

        Errors (e.g. duplicate primary key) are reported, not fatal.
        """
        try:
            # Fixed: original listed 5 columns with only 4 placeholders
            # and dropped mNote, so every insert failed silently.
            sql = ("insert into phones(mNO,mMark,mPrice,mNote,mFile)"
                   "values(?,?,?,?,?)")
            self.cursor.execute(sql, (mNO, mMark, mPrice, mNote, mFile))
        except sqlite3.Error as err:
            print(err)

    def ShowDB(self):
        """Print every record stored in phones.db."""
        try:
            con = sqlite3.connect("phones.db")
            cursor = con.cursor()
            # Fixed: select from "phones" (was the nonexistent "phone")
            # and drop the stray print that used `row` before the loop.
            cursor.execute("select mNO,mMark,mPrice,mFile from phones order by mNO")
            for row in cursor.fetchall():
                print(row[0], row[1], row[2], row[3])
            con.close()
        except sqlite3.Error as err:
            print(err)

    def downloadSpider(self, src1, src2, mFile):
        """Download the image at src1 (falling back to src2) and save it
        as imagePath/mFile.  Runs in a worker thread; errors on one URL
        just cause the fallback to be tried.
        """
        data = None
        if src1:
            try:
                # Fixed keyword: urllib's Request takes headers=, not header=.
                req = urllib.request.Request(src1, headers=MySipider.herder)
                resp = urllib.request.urlopen(req, timeout=400)
                data = resp.read()
            except Exception:
                pass  # fall through to src2
        if not data and src2:
            try:
                req = urllib.request.Request(src2, headers=MySipider.herder)
                resp = urllib.request.urlopen(req, timeout=400)
                data = resp.read()
            except Exception:
                pass
        if data:
            # os.path.join instead of a hard-coded Windows "\\" separator;
            # `with` guarantees the file is closed even on write errors.
            with open(os.path.join(MySipider.imagePath, mFile), "wb") as fobj:
                fobj.write(data)
            print("download", mFile)

    def ProcessSpider(self):
        """Scrape the current result page, spawn image downloads, store
        each item, then click "next" and recurse until the last page
        (where the next-button carries the "disabled" class).
        """
        try:
            time.sleep(1)
            print(self.driver.current_url)
            # Fixed: find_elements (plural) so the result is iterable, and
            # JD's item class is "gl-item", not "gl_item".
            lis = self.driver.find_elements_by_xpath(
                "//div[@id='J_goodsList']//li[@class='gl-item']")
            for li in lis:
                try:
                    src1 = li.find_element_by_xpath(
                        ".//div[@class='p-img']//a//img").get_attribute("src")
                except Exception:
                    src1 = ""
                try:
                    # NOTE(review): lazily-loaded images appear to keep the
                    # real URL in the "src2" attribute — confirm against
                    # JD's current markup.
                    src2 = li.find_element_by_xpath(
                        ".//div[@class='p-img']//a//img").get_attribute("src2")
                except Exception:
                    src2 = ""
                try:
                    price = li.find_element_by_xpath(
                        ".//div[@class='p-price']//i").text
                except Exception:
                    price = "0"
                try:
                    # Item title; its first word serves as the brand mark.
                    note = li.find_element_by_xpath(
                        ".//div[@class='p-name']//em").text
                    mark = note.split(" ")[0]
                except Exception:
                    note = ""
                    mark = ""
                # Fixed: `no` and `mark` were never assigned in the
                # original, so every iteration raised NameError.
                self.No = self.No + 1
                no = str(self.No).zfill(6)
                mFile = ""
                if src1:
                    src1 = urllib.request.urljoin(self.driver.current_url, src1)
                    p = src1.rfind(".")
                    mFile = no + src1[p:]
                elif src2:
                    src2 = urllib.request.urljoin(self.driver.current_url, src2)
                    p = src2.rfind(".")
                    # Fixed: extension taken from src2 (was src1[p:]).
                    mFile = no + src2[p:]
                if src1 or src2:
                    T = threading.Thread(target=self.downloadSpider,
                                         args=(src1, src2, mFile))
                    T.daemon = False
                    T.start()
                    self.thread.append(T)
                # Fixed: pass all five values InsertDB expects.
                self.InsertDB(no, mark, price, note, mFile)
            # Stop on the last page, otherwise click next and recurse.
            # XPath fixed: the pager element is a <span>, not <pan>.
            try:
                self.driver.find_element_by_xpath(
                    "//span[@class='p-num']//a[@class='pn-next disabled']")
            except Exception:
                nextpage = self.driver.find_element_by_xpath(
                    "//span[@class='p-num']//a[@class='pn-next']")
                nextpage.click()
                self.ProcessSpider()
        except Exception as err:
            print(err)

    def ExecuteSpider(self, url, key):
        """Run a full crawl: start up, scrape all pages, wait for the
        download threads, then commit and close everything.
        """
        starttime = datetime.datetime.now()
        self.StartUp(url, key)
        self.ProcessSpider()
        # Fixed ordering: join the download threads BEFORE CloseUp, so
        # the DB commit and browser shutdown happen after all downloads
        # have finished (original closed first, then joined).
        for t in self.thread:
            t.join()
        self.CloseUp()
        print("total time:", datetime.datetime.now() - starttime)
url = "http://www.jd.com"
spider = MySipider()


def _menu():
    """Interactive menu loop: 1 = crawl, 2 = show stored records, 3 = quit."""
    while True:
        print("1,爬取")
        print("2,显示")
        print("3,退出")
        s = input("请输入选择")
        if s == "1":
            spider.ExecuteSpider(url, "手机")
        elif s == "2":
            spider.ShowDB()
        elif s == "3":
            break


# Guarded so importing this module no longer starts the blocking
# input() loop as a side effect.
if __name__ == "__main__":
    _menu()
#lis = self.driver.find_element_by_xpath("//div[@id='J_goodsList']//li[@class='gl_item']")
#for li in lis:
# try:
# price = li.find_element_by_xpath(".//div[@class='p-price']//i").text
# except :
# price="0"
# try:
# src=li.find_element_by_xpath(".//div[@class='p-img']//a//img").get_attribute("src")
# except :
# src=""
#try:
# self.driver.find_element_by_xpath("//pan[@class='p-num']//a[@class='pn-next disabled']")
#except :
# nextpage=self.driver.find_element_by_xpath("//pan[@class='p-num']//a[@class='pn-next']")
# nextpage.click()