使用 requests 和 re 正则匹配批量爬取 IP 的经纬度,并存入 MySQL 中

# 安装 
pip install pymysql
pip install requests
import random

import json

import pymysql
import time
import requests

import re  # originally imported inside the __main__ guard; needed at module scope by _extract_ips

# Connection parameters are placeholders and must be replaced with real values.
# The connection is opened at import time, as in the original script.
db = pymysql.connect(host='xxx', user='xxx', password='xxx', database='xxx')
cur = db.cursor()

# Pool of browser User-Agent strings; one is picked at random for each request.
user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
    "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
    "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
    "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
    "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
    "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
    "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
    "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
]

# Base request headers. ip_map() overrides "User-Agent" on every call so the
# agent actually rotates instead of being fixed once at import time.
headers = {
    'Connection': 'close',
    "User-Agent": random.choice(user_agent_list),
}

# In-memory record of every successful lookup; index i of each list belongs
# to the same IP address.
Country = []
RegionName = []
City = []
Lat = []
Lon = []

# Seconds to wait before each lookup (same pause as the original script).
REQUEST_INTERVAL_SECONDS = 6

# Dotted-quad IPv4 matcher; each octet is limited to 0-255.
_IP_PATTERN = re.compile(
    r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b"
)


def _extract_ips(text):
    """Return every IPv4 address found in *text*, in order, duplicates kept."""
    return _IP_PATTERN.findall(text)


def ip_map(ip):
    """Look up the location of *ip* via ip-api.com and store it in MySQL.

    The country, region name, city, latitude and longitude from the JSON
    response are appended to the module-level lists and inserted as one row
    through the module-level cursor.

    Args:
        ip: IPv4 address string to look up.

    Raises:
        requests.RequestException: the HTTP request failed or returned an
            error status.
        ValueError: the response body is not valid JSON.
        KeyError: the response lacks one of the expected fields. Nothing is
            recorded in that case.
        pymysql.MySQLError: the insert or commit failed; the transaction is
            rolled back before the error is re-raised.
    """
    url = f"http://ip-api.com/json/{ip}?fields=61439&lang=zh-CN"
    request_headers = dict(headers)
    request_headers["User-Agent"] = random.choice(user_agent_list)

    response = requests.get(url=url, headers=request_headers, timeout=5)
    response.raise_for_status()
    res = json.loads(response.text)

    # Read every field before touching the result lists so that a missing
    # key cannot leave the lists with different lengths.
    country = res['country']
    regionName = res['regionName']
    city = res['city']
    lat = res['lat']
    lon = res['lon']
    query = res['query']

    Country.append(country)
    RegionName.append(regionName)
    City.append(city)
    Lat.append(lat)
    Lon.append(lon)

    # "[table_name]" is a placeholder left by the author; replace it with the
    # real table name before running.
    sql = "insert into [table_name](query,country,regionName,city,lat,lon) values(%s,%s,%s,%s,%s,%s)"
    try:
        cur.execute(sql, (query, country, regionName, city, lat, lon))
        # Commit on the connection. The original called commit() on the
        # response text, which raised AttributeError and never committed.
        db.commit()
    except pymysql.MySQLError:
        db.rollback()
        raise
    print('插入成功')


def main():
    """Read xxxx.txt, extract every IPv4 address and store its location."""
    try:
        with open("xxxx.txt") as f:
            content = f.read()

        # An empty match list simply skips the loop.
        for ip in _extract_ips(content):
            time.sleep(REQUEST_INTERVAL_SECONDS)
            try:
                ip_map(ip)
            except (requests.RequestException, pymysql.MySQLError,
                    KeyError, ValueError) as exc:
                # Best effort: report the failure and continue with the next
                # address instead of silently swallowing it.
                print(f"skipping {ip}: {exc!r}")
    finally:
        # Release the database resources once, after all lookups are done.
        cur.close()
        db.close()


if __name__ == '__main__':
    main()

声明:本博客仅供学习研究使用,用于违法或其他用途与本站无关。

   不可私自转发!

posted @ 2020-07-16 14:14  Handsome、Snake  阅读(185)  评论(0)  编辑  收藏  举报