在 Python 中使用 PySpark 读取并整理日志数据,再将整理后的数据写入 Elasticsearch(ES)

 

代码如下

import re
import datetime
from pyspark.sql import SparkSession
from pyspark import SparkContext
from elasticsearch import Elasticsearch
# Create (or reuse) the Spark session named "lz", then fetch the SparkContext.
spark = SparkSession.builder.appName("lz").getOrCreate()
sc = SparkContext.getOrCreate()

# Elasticsearch client constructed with no arguments.
es = Elasticsearch()

# English month abbreviation -> month number as an unpadded string
# ('Jan' -> '1', ..., 'Dec' -> '12').
month_map = {
    abbr: str(number)
    for number, abbr in enumerate(
        ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
         'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'),
        start=1,
    )
}

# Read the local log file with Spark.
log_data = sc.textFile("/Desktop/data_doc/data_Log/sshlogin/03.txt")


# Expected line layout:
#   <tag>:<Mon> <day> <HH:MM:SS> <host> <program>[<pid>]: <message>
# e.g.
#   'Ambari:Mar  2 02:14:16 ambari sshd[16716]: Accepted password for root from 172.21.202.174 port 59886 ssh2'
# Patterns are compiled once instead of being looked up on every iteration.
_LOG_LINE_RE = re.compile(
    r'^(\S+):(\w{3})\s+(\d{1,2})\s(\d{2}:\d{2}:\d{2})\s(\S+)\s(\S+)\[(\d+)\]:\s(.+)')
# Octets may have 1-3 digits; the previous {2,3} rejected addresses such as
# 10.0.0.1 and the loop then crashed on the failed match.
_ACCEPTED_RE = re.compile(
    r'Accepted\s\w+\sfor\s(\S+)\sfrom\s(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\sport\s(\d+)')
_ACCEPTED_PREFIXES = ('Accepted password for', 'Accepted publickey for')

# The log lines carry no year, so a fixed one is used when building timestamps.
year = '2019'

# Pull the lines to the driver one at a time, parse each with the regexes
# above and index the resulting dict into Elasticsearch as the document body.
for b in log_data.toLocalIterator():
    log_group = _LOG_LINE_RE.search(b)
    if log_group is None:
        # Skip lines that do not follow the expected layout (blank lines,
        # other formats) rather than aborting the whole ingest.
        continue

    try:
        # Assemble "YYYY-M-D HH:MM:SS" from the fixed year and the captured
        # month / day / time fields, then parse it into a datetime.
        logtime = datetime.datetime.strptime(
            year + '-' + month_map[log_group.group(2)] + '-'
            + log_group.group(3) + ' ' + log_group.group(4),
            '%Y-%m-%d %H:%M:%S')
    except (KeyError, ValueError):
        # Unknown month abbreviation or impossible date/time: index the row
        # without a timestamp instead of reusing the previous row's value.
        logtime = None

    # Collect the captured fields into the document to index.
    row = dict(_hostname=log_group.group(1),
               syslog_timestamp=logtime,
               hostname=log_group.group(5),
               program=log_group.group(6),
               pid=log_group.group(7),
               msg=log_group.group(8))

    if row['msg'].startswith(_ACCEPTED_PREFIXES):
        row['login_success'] = True
        msg_a = _ACCEPTED_RE.search(row['msg'])
        if msg_a is not None:
            # Details are attached only when user, IPv4 address and port can
            # all be extracted from the message.
            row['login_success_msg'] = {'username': msg_a.group(1),
                                        'user_ip': msg_a.group(2),
                                        'user_port': msg_a.group(3)}

    # Write the document to Elasticsearch.
    es.index(index='data_log02', doc_type='test02', body=row)

另一种日志(UTM 日志)的处理方式

import datetime
from pyspark import SparkContext
from elasticsearch import Elasticsearch


# Reuse the active SparkContext (or create one) and load the UTM log file.
sc = SparkContext.getOrCreate()
log_data = sc.textFile("/Desktop/data_doc/data_Log/utm/GX04-UTM1000D-1")

# Format of a single log entry (shown wrapped):
#
# Mar  1 00:00:08 172.21.208.21 date=2019-03-01 time=00:00:08 devname=GX04-UTM1000D-1 devid=FGT1KD3914800909
# logid=0001000014 type=traffic subtype=local level=notice vd=root srcip=195.142.115.111 srcport=54045 srcintf="port12"
#  dstip=114.242.119.194 dstport=80 dstintf="root" sessionid=1013402601 status=deny policyid=0 dstcountry="China"
#   srccountry="Turkey" trandisp=noop service=FortiGuard proto=6 app="Web Management" duration=0 sentbyte=0
#    rcvdbyte=0 sentpkt=0

es = Elasticsearch()

# Split every line on whitespace, then print the first resulting record.
fileds = log_data.map(lambda line: line.split())
print(fileds.first())

# Shape of a record after the split:
#
# ['Mar', '1', '00:00:06', '172.21.208.21', 'date=2019-03-01', 'time=00:00:06',
#  'devname=GX04-UTM1000D-1', 'devid=FGT1KD3914800909', 'logid=0001000014', 'type=traffic',
#   'subtype=local', 'level=notice', 'vd=root', 'srcip=89.248.172.38', 'srcport=40462', 'srcintf="port12"',
#    'dstip=114.242.119.252', 'dstport=55325', 'dstintf="root"', 'sessionid=1013402572', 'status=deny',
#     'policyid=0', 'dstcountry="China"', 'srccountry="Netherlands"', 'trandisp=noop', 'service=55325/tcp',
#      'proto=6', 'app=55325/tcp', 'duration=0', 'sentbyte=0', 'rcvdbyte=0', 'sentpkt=0']
def _utm_pairs(tokens):
    """Build a dict from whitespace-split ``key=value`` tokens.

    A value that opens with a double quote but does not close it in the same
    token (for example ``app="Web`` followed by ``Management"``) was split on
    a space inside the quotes; the following tokens are appended, separated by
    a single space, until one ends with a double quote. Only the first ``=``
    of a token separates key from value, so values containing ``=`` are kept.
    Surrounding double quotes are stripped from every value. Tokens without
    ``=`` that are not part of a quoted value are ignored.
    """
    pairs = {}
    open_key = None  # key whose quoted value is still being collected
    for token in tokens:
        if open_key is not None:
            pairs[open_key] += ' ' + token
            if token.endswith('"'):
                open_key = None
            continue
        key, sep, value = token.partition('=')
        if not sep or not key:
            continue
        pairs[key] = value
        if value.startswith('"') and not (len(value) > 1 and value.endswith('"')):
            open_key = key
    return {key: value.strip('"') for key, value in pairs.items()}


start = datetime.datetime.now()
# First-pass UTM handling: read the key=value pairs of every record and index
# them into Elasticsearch. The first four tokens (month, day, time, source
# address in the sample record) are skipped.
for b in fileds.toLocalIterator():
    d = _utm_pairs(b[4:])

    if 'date' in d:
        d['date'] = datetime.datetime.strptime(d['date'], "%Y-%m-%d")
    if 'time' in d:
        parsed_time = datetime.datetime.strptime(d['time'], "%H:%M:%S")
        if 'date' in d:
            # strptime with only a clock time yields a datetime on 1900-01-01;
            # attach the record's own date so the value is a real timestamp.
            parsed_time = datetime.datetime.combine(d['date'].date(),
                                                    parsed_time.time())
        d['time'] = parsed_time

    if d:
        # Index the collected key/value pairs as the document body; records
        # that produced no pairs are not indexed.
        es.index(index='data_log01', doc_type='test01', body=d)
end = datetime.datetime.now()

print(end-start,'这是时间')

 

posted on 2019-05-28 14:02  王大拿  阅读(3364)  评论(0)  编辑  收藏  举报

导航