[Web Excerpt] Python Crawler Example (6): Scraping the Mayi (蚂蚁) Free Proxy List
This post is excerpted from: https://www.cnblogs.com/xuchunlin/p/6774414.html

SQL for the database table:
CREATE TABLE `free_ip` (
  `free_ip_id` int(11) NOT NULL AUTO_INCREMENT COMMENT 'primary key',
  `ip` varchar(255) DEFAULT NULL COMMENT 'IP address',
  `port` varchar(255) DEFAULT NULL COMMENT 'port',
  `yini_class` varchar(255) DEFAULT NULL COMMENT 'anonymity level',
  `http_type` varchar(255) DEFAULT NULL COMMENT 'proxy type',
  `response_time` varchar(255) DEFAULT NULL COMMENT 'response time',
  `address` varchar(255) DEFAULT NULL COMMENT 'location',
  `validate_time` varchar(255) DEFAULT NULL COMMENT 'last verified',
  `hashcode` varchar(255) DEFAULT NULL COMMENT 'dedup key',
  PRIMARY KEY (`free_ip_id`),
  UNIQUE KEY `hashcode` (`hashcode`) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=4220 DEFAULT CHARSET=utf8;
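Because `hashcode` stores the MD5 of "ip:port" and carries a UNIQUE key, re-running the crawler cannot insert the same proxy twice; the duplicate insert fails at the database level. A minimal sketch of that collision (Python 2, MySQLdb; the connection parameters are placeholders, as in the script below):

import hashlib
import MySQLdb

conn = MySQLdb.connect(host="127.0.0.1", user="user", passwd="pass", db="test", charset="utf8")
cursor = conn.cursor()
proxy = "1.2.3.4:8080"
hashcode = hashlib.md5(proxy).hexdigest()  # same dedup key the crawler computes
try:
    cursor.execute("INSERT INTO free_ip (ip, port, hashcode) VALUES (%s, %s, %s)",
                   ("1.2.3.4", "8080", hashcode))
    conn.commit()
except MySQLdb.IntegrityError:
    # A second run with the same ip:port violates the UNIQUE `hashcode` key.
    conn.rollback()
    print "duplicate proxy, skipped"
cursor.close()
conn.close()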
Source code:
# coding:utf-8
# NOTE: this script targets Python 2 (print statements, reload(sys), MySQLdb).
import re
import sys
import hashlib
import platform
import logging
import logging.handlers

import requests
import urllib3
import MySQLdb
from bs4 import BeautifulSoup

urllib3.disable_warnings()

reload(sys)
sys.setdefaultencoding('utf-8')  # Python 2 workaround for implicit unicode conversions

session = requests.session()

# Log to a platform-specific file path.
sysStr = platform.system()
if sysStr == "Windows":
    LOG_FILE_check = 'H:\\log\\log.txt'
else:
    LOG_FILE_check = '/log/wlb/crawler/cic.log'

# Rotating log handler: 128 MB per file, at most 10 backup files.
handler = logging.handlers.RotatingFileHandler(LOG_FILE_check, maxBytes=128 * 1024 * 1024, backupCount=10)
fmt = '\n' + '%(asctime)s - %(filename)s:%(lineno)s - %(message)s'
formatter = logging.Formatter(fmt)
handler.setFormatter(formatter)
logger = logging.getLogger('check')  # logger named "check"
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)


def md5(s):
    # Hex MD5 digest of s; used as the dedup key for "ip:port".
    m = hashlib.md5()
    m.update(s)
    return m.hexdigest()


def freeIp():
    for i in range(1, 1000):
        print "Crawling page:", i
        url = "http://www.ip181.com/daili/" + str(i) + ".html"
        headers = {
            "Host": "www.ip181.com",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.91 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Referer": url,
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.8",
        }
        try:
            # requests falls back to ISO-8859-1 when no charset header is sent,
            # so re-encode to raw bytes and decode with the charset declared
            # inside the HTML content itself.
            result = session.get(url=url, headers=headers).text
            result = result.encode('ISO-8859-1').decode(requests.utils.get_encodings_from_content(result)[0])
        except:
            # Retry once on any failure (network error, missing charset, ...).
            result = session.get(url=url, headers=headers).text
            result = result.encode('ISO-8859-1').decode(requests.utils.get_encodings_from_content(result)[0])

        # Take the second col-md-12 block, flatten whitespace, and cut out the
        # table body between the header row and the pagination bar.
        soup = BeautifulSoup(result, 'html.parser')
        result_soup = soup.find_all("div", attrs={"class": "col-md-12"})[1]
        result_soup = str(result_soup).replace('\r\n\t', '').replace('\r\n', '').replace('\n\t', '').replace('\n', '').replace(' class="warning"', '')
        result_soups = re.findall('最近验证时间</td></tr>(.*?)</tbody></table><div class="page">共', result_soup)[0]
        print result_soups

        # Each row: ip, port, anonymity level, proxy type, response time,
        # location, last-verified time.
        result_list = re.findall('<tr><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td></tr>', result_soups)
        for item in result_list:
            ip = item[0]
            port = item[1]
            yini_class = item[2]
            http_type = item[3]
            response_time = item[4]
            address = item[5]
            validate_time = item[6]
            proxy = str(ip) + ":" + port
            hashcode = md5(proxy)  # the UNIQUE key in the table rejects duplicates
            try:
                # Database connection: replace host/user/passwd/db with your own.
                conn = MySQLdb.connect(host="110.110.110.717", user="lg", passwd="456", db="369", charset="utf8")
                cursor = conn.cursor()
                sql = """INSERT INTO free_ip (ip,port,yini_class,http_type,response_time,address,validate_time,hashcode)
                         VALUES (%s,%s,%s,%s,%s,%s,%s,%s)"""
                params = (ip, port, yini_class, http_type, response_time, address, validate_time, hashcode)
                cursor.execute(sql, params)
                conn.commit()
                cursor.close()
                conn.close()
                print " insert succeeded "
            except Exception as e:
                # Duplicate hashcodes raise an IntegrityError and land here.
                print "********insert failed********"
                print e


freeIp()
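Free proxies go stale quickly, so rows in `free_ip` are only useful while the proxies still answer. A minimal liveness-check sketch, not part of the original post (Python 2; the connection parameters, test URL, and 5-second timeout are arbitrary choices):

import requests
import MySQLdb

conn = MySQLdb.connect(host="127.0.0.1", user="user", passwd="pass", db="test", charset="utf8")
cursor = conn.cursor()
cursor.execute("SELECT ip, port FROM free_ip LIMIT 20")
for ip, port in cursor.fetchall():
    # Route both schemes through the candidate proxy.
    proxies = {"http": "http://%s:%s" % (ip, port),
               "https": "http://%s:%s" % (ip, port)}
    try:
        # httpbin.org/ip echoes the caller's IP, so a 200 means the proxy answered.
        r = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=5)
        print ip, port, "alive:", r.status_code
    except requests.RequestException:
        print ip, port, "dead"
cursor.close()
conn.close()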
Crawl output: (screenshot in the original post)
