python为火车头写插件
火车头官方现在已支持用 Python 编写插件。最开始我按照官方文档安装了 Python 3.8.8,但调用插件时总是报错;咨询客服后得知是 Python 版本太高,于是卸载后重新安装了 Python 3.6,再次测试,问题完美解决。
贴一个写好的插件
# -*- coding: utf-8 -*-
# @Author: kaka
# @Date: 2022-03-03 09:45:11
# @Last Modified by: kaka
# @Last Modified time: 2022-03-17 11:08:08
# @Email: zhckaka@sina.com
import os
import sys
import re
import datetime
# from w3lib import html
import html
from scrapy.selector import Selector
import importlib
from urllib.parse import urljoin
from urllib import parse
import json
import requests
import emoji
import re
# Separators used when flattening comment threads into one string.
REPLY_START = "$$$"  # between a top-level comment and its first reply
REPLY_SEP = "@@@"    # between consecutive replies of the same comment
THREAD_SEP = "&&&"   # between top-level comment threads


def read_serialized_labels(serialized):
    """Decode the label payload passed as the last command-line argument.

    ``serialized`` is either the JSON text itself (recognised by a leading
    ``{"``) or the path of a file holding the URL-quoted JSON text.

    Returns the object produced by ``json.loads``.
    Raises ``OSError`` if the file cannot be opened and ``ValueError`` if the
    text is not valid JSON.
    """
    if serialized[0:2] != '{"':
        # Not inline JSON, so treat the argument as a file path. ``with``
        # guarantees the handle is closed even if reading fails.
        with open(serialized) as label_file:
            serialized = parse.unquote(label_file.read())
    return json.loads(serialized)


def extract_comment_models(raw_comments):
    """Return the ``features.comments.models`` mapping of the payload.

    Returns ``None`` when ``raw_comments`` is not valid JSON, when the
    expected keys are missing, or when ``models`` is not a dict, so the
    caller can report an error instead of crashing without any output.
    """
    try:
        payload = json.loads(str(raw_comments))
        models = payload["features"]["comments"]["models"]
    except (ValueError, KeyError, TypeError):
        return None
    return models if isinstance(models, dict) else None


def _comment_document(model):
    """Return ``model["media"]["richtextContent"]["document"]`` or ``[]``."""
    node = model
    for key in ("media", "richtextContent", "document"):
        if not isinstance(node, dict):
            return []
        node = node.get(key)
    return node if isinstance(node, list) else []


def _comment_text(document):
    """Join the ``"t"`` text of every fragment of every paragraph with spaces.

    Fragments without a string ``"t"`` value are skipped; joining ``None``
    would raise ``TypeError``.
    """
    pieces = []
    for paragraph in document:
        if not isinstance(paragraph, dict):
            continue
        for fragment in paragraph.get("c") or []:
            text = fragment.get("t") if isinstance(fragment, dict) else None
            if isinstance(text, str):
                pieces.append(text)
    return " ".join(pieces)


def merge_comment_threads(models):
    """Flatten a ``{model_id: model}`` mapping into one comment string.

    A model without ``parentId`` is a top-level comment; a model whose
    ``parentId`` names a top-level comment is a reply to it. Replies to
    unknown parents and models with an empty document are ignored.

    Output format: ``comment$$$reply1@@@reply2&&&next_comment...``.
    All paragraphs of a comment are kept (joined by spaces), and each reply
    model contributes exactly one reply entry.
    """
    # model_id -> [top-level text, list of reply texts]; dict order is the
    # order in which the top-level comments appear in ``models``.
    threads = {}
    for model_id, model in models.items():
        if not isinstance(model, dict) or model.get("parentId", ""):
            continue
        document = _comment_document(model)
        if document:
            threads[model_id] = [_comment_text(document), []]

    for model in models.values():
        if not isinstance(model, dict):
            continue
        parent_id = model.get("parentId", "")
        if not parent_id or parent_id not in threads:
            continue
        document = _comment_document(model)
        if document:
            threads[parent_id][1].append(_comment_text(document))

    merged = []
    for text, replies in threads.values():
        if replies:
            # Replies are tracked separately, so the separators stay correct
            # even when the comment text itself contains "$$$".
            text = "{0}{1}{2}".format(text, REPLY_START, REPLY_SEP.join(replies))
        merged.append(text)
    return THREAD_SEP.join(merged)


def process_labels(page_type, label_url, label_cookie, labels):
    """Apply the plugin's processing to ``labels`` in place and return it.

    * ``page_type == "Save"``: a non-empty ``content_comments`` label is
      parsed, flattened with ``merge_comment_threads`` and passed through
      ``emoji.demojize``. If the payload is malformed or has no models, the
      label is set to ``"errors"``. An empty or missing label is left alone.
    * any other page type: ``Html`` is prefixed with the URL, page type and
      cookie information.
    """
    if page_type == "Save":
        raw_comments = labels.get('content_comments')
        if raw_comments:
            models = extract_comment_models(raw_comments)
            if models:
                flattened = merge_comment_threads(models)
                # Relies on the third-party ``emoji`` package imported at the
                # top of the file; replaces emoji characters with text names.
                labels['content_comments'] = emoji.demojize(flattened)
            else:
                labels['content_comments'] = "errors"
    else:
        labels['Html'] = '当前页面的网址为:' + label_url + "\r\n页面类型为:" + page_type + \
            "\r\nCookies数据为:" + label_cookie + "\r\n接收到的数据是:" + labels['Html']
    return labels


def main(argv):
    """Run the plugin with the given ``sys.argv``-style argument list.

    Expects exactly five entries: script name, URL-quoted cookie, URL-quoted
    page URL, page type, and the URL-quoted label payload (or a file path).
    According to the original template comment, the page type is one of
    ``List``, ``Content``, ``Pages`` (HTTP response handling) or ``Save``
    (content processing).

    Prints the processed labels as JSON on stdout.
    """
    if len(argv) != 5:
        print(len(argv))
        print("命令行参数长度不为5")
        sys.exit()

    label_cookie = parse.unquote(argv[1])
    label_url = parse.unquote(argv[2])
    page_type = argv[3]
    labels = read_serialized_labels(parse.unquote(argv[4]))

    labels = process_labels(page_type, label_url, label_cookie, labels)
    print(json.dumps(labels))


if __name__ == "__main__":
    main(sys.argv)
用这种方式清洗数据真是太舒服了,特此记录。Python 的路真是越走越宽了。

浙公网安备 33010602011771号