#!/usr/bin/env python
# coding:utf8
# author:Z time:2018/8/14
import requests
import time
from bs4 import BeautifulSoup
import pymysql
from pymysql import OperationalError
from selenium import webdriver
import logging
from ip_proxy.ip_proxy_filter import filter_db
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import WebDriverException
chrome_options = Options()
chrome_options.add_argument('window-size=1920x3000')  # set the browser window size
chrome_options.add_argument('--disable-gpu')  # the Chrome docs recommend this flag to work around a bug
chrome_options.add_argument('--hide-scrollbars')  # hide scrollbars for pages that would otherwise show them
chrome_options.add_argument('blink-settings=imagesEnabled=false')  # skip loading images to speed up crawling
chrome_options.add_argument('--headless')  # no visible window; on Linux hosts without a display, startup fails without this
# chrome_options.binary_location = r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe"  # optionally point at a specific Chrome binary
# Note: the browser itself is created inside main() so each crawl pass gets a fresh
# instance; creating one here at import time would leave an unused headless Chrome
# process running.
logger = logging.getLogger(__file__)


def save_log():
    """Attach a file handler and a console handler to the module logger."""
    if logger.handlers:  # main() loops forever, so guard against adding duplicate handlers
        return
    # handler objects receive records from the logger and control where they go
    fh = logging.FileHandler('D:/py3code/jintong_day1/aaa/cnstock.log', encoding='utf8')  # handler that writes to the log file
    ch = logging.StreamHandler()  # handler that echoes to the console (terminal)
    # formatter object: the log line layout shared by both handlers
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s -%(module)s: %(message)s',
                                  datefmt='%Y-%m-%d %H:%M:%S %p')
    fh.setLevel(logging.DEBUG)
    # bind the formatter to both handlers
    fh.setFormatter(formatter)
    ch.setFormatter(formatter)
    # add the handlers to the logger and set its level (20 == logging.INFO)
    logger.addHandler(fh)  # a logger can carry several file/console handlers
    logger.addHandler(ch)
    logger.setLevel(logging.INFO)
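
# Usage sketch (illustrative, not executed here): because of the handler guard above,
# save_log() is safe to call once at startup and the module logger can then be used
# from anywhere in this file, e.g.:
#
#     save_log()
#     logger.info('crawler started')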


def save_db(details):
    """Insert one scraped proxy row into the ip_proxy table, skipping duplicate IPs."""
    connect = pymysql.Connect(
        host='127.0.0.1',
        port=3306,
        user='root',
        password='',
        db='haha',
        charset='utf8'
    )
    cursor = connect.cursor()
    row = details[0]
    ip_address = row[0]
    # check whether this IP address is already stored
    check_sql = "SELECT ip_address FROM ip_proxy WHERE ip_address = %s"
    # parameterised insert: pymysql does the escaping, so the old
    # pymysql.escape_string() call on the type_ field is no longer needed
    insert_sql = """
        INSERT INTO ip_proxy (ip_address, port, server_address, whether_anonymous, type_, live_time, proof_time)
        VALUES (%s, %s, %s, %s, %s, %s, %s)
    """
    try:
        cursor.execute(check_sql, (ip_address,))
        if len(cursor.fetchall()) == 0:
            cursor.execute(insert_sql, tuple(row))
            connect.commit()
            logger.info('successfully inserted 1 proxy record')
        else:
            logger.info('record already exists')
    except OperationalError:
        pass
    finally:
        cursor.close()
        connect.close()
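

# Illustrative sketch (an assumption, not part of the original crawl flow and never
# called here): one way a consumer of the ip_proxy table filled by save_db() could
# pull a stored proxy and smoke-test it with requests. The connection settings and
# column names mirror save_db(); the test URL http://httpbin.org/ip and the helper
# name are placeholders for illustration only.
def check_one_proxy_example():
    connect = pymysql.Connect(host='127.0.0.1', port=3306, user='root',
                              password='', db='haha', charset='utf8')
    cursor = connect.cursor()
    cursor.execute("SELECT ip_address, port, type_ FROM ip_proxy LIMIT 1")
    stored = cursor.fetchone()
    cursor.close()
    connect.close()
    if stored is None:
        return False  # nothing scraped yet
    ip_address, port, type_ = stored
    proxy = '{}://{}:{}'.format(type_.lower(), ip_address, port)
    try:
        # a proxy that answers within a few seconds is treated as usable
        resp = requests.get('http://httpbin.org/ip',
                            proxies={'http': proxy, 'https': proxy}, timeout=5)
        return resp.status_code == 200
    except requests.RequestException:
        return False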


def main():
    save_log()  # attach the log handlers once, before the endless crawl loop
    while True:
        # start a fresh headless browser for each crawl pass
        driver = webdriver.Chrome(chrome_options=chrome_options)
        for page in range(1, 50):
            url = 'http://www.xicidaili.com/nn/{}'.format(page)
            try:
                driver.get(url)
            except WebDriverException as e:
                print(e)
                continue  # skip this page if it failed to load
            tr_nums = driver.find_elements_by_xpath('//*[@id="ip_list"]/tbody/tr')
            print(len(tr_nums))
            # the first <tr> is the table header, so data rows start at index 2
            for row in range(2, len(tr_nums) + 1):
                base = '//*[@id="ip_list"]/tbody/tr[' + str(row) + ']'
                ip_address = driver.find_element_by_xpath(base + '/td[2]').text
                port = driver.find_element_by_xpath(base + '/td[3]').text
                server_address = driver.find_element_by_xpath(base + '/td[4]').text
                whether_anonymous = driver.find_element_by_xpath(base + '/td[5]').text
                type_ = driver.find_element_by_xpath(base + '/td[6]').text  # renamed: `type` shadows the builtin
                live_time = driver.find_element_by_xpath(base + '/td[9]').text
                proof_time = driver.find_element_by_xpath(base + '/td[10]').text
                detail = [[ip_address, port, server_address, whether_anonymous, type_, live_time, proof_time]]
                try:
                    save_db(detail)
                except OperationalError as e:
                    print(e)
            time.sleep(2)
        try:
            filter_db()  # drop proxies whose recorded live time no longer qualifies
        except OperationalError as e:
            print(e)
        driver.quit()  # quit() (rather than close()) shuts down the whole browser process
        time.sleep(300)

if __name__ == '__main__':
main()


#!/usr/bin/env python
# coding:utf8
# author:Z time:2018/8/15
# ip_proxy/ip_proxy_filter.py -- imported above as `from ip_proxy.ip_proxy_filter import filter_db`
import pymysql
import re


# Remove proxies whose recorded live time is not under 6 minutes: only entries
# measured in '分钟' (minutes) with a value below 6 are kept
def filter_db():
    connect = pymysql.Connect(
        host='127.0.0.1',
        port=3306,
        user='root',
        password='',
        db='haha',
        charset='utf8'
    )
    cursor = connect.cursor()
    cursor.execute("SELECT live_time FROM ip_proxy")
    live_time = cursor.fetchall()
    for i in live_time:
        # live_time is a string such as '5分钟': split it into the number and the unit
        # (raw strings avoid the invalid-escape-sequence warning of the old patterns)
        live_time_num = re.findall(r'(\d+)\D+', i[0])[0]
        live_time_end = re.findall(r'\d+(\D+)', i[0])[0]
        # keep only rows measured in '分钟' (minutes) with a value below 6; delete the rest
        if not (int(live_time_num) < 6 and live_time_end == '分钟'):
            cursor.execute("DELETE FROM ip_proxy WHERE live_time = %s", (i[0],))
            connect.commit()
    cursor.close()
    connect.close()
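

# Illustrative note on the live_time parsing above (the sample value '5分钟' is an
# assumption about the source site's format; '分钟' means minutes):
#
#     re.findall(r'(\d+)\D+', '5分钟')   # -> ['5']    (the numeric part)
#     re.findall(r'\d+(\D+)', '5分钟')   # -> ['分钟']  (the unit part)
#
# Only rows whose unit is '分钟' and whose number is below 6 survive a filter pass.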


if __name__ == '__main__':
    # guard the ad-hoc run so that importing this module from the crawler above
    # does not trigger a delete pass at import time
    filter_db()