测试样本多种组合的选取方法
测试样本多种组合的选取方法
1、正交实验法
需要查询正交表, 正交表对于高维度任务不太适用,采用全对偶方法。
2、转移矩阵 状态转移矩阵
生成这个对话的过程是一个“马尔科夫过程”,并且我们是对这个马尔科夫过程做了“一阶马尔科夫假设”,我们的转移矩阵就是马尔科夫过程中的状态转移概率矩阵,状态是我们的domain-intent-slot,概率就是完全平均的概率(因为每次选择都是随机选的)

马尔科夫链原理
1 状态转移矩阵
可以收敛
2 每个状态只与前面一个状态相关
$P(x_{t+1} \mid \cdots, x_{t-2}, x_{t-1}, x_t) = P(x_{t+1} \mid x_t)$
参考:https://blog.csdn.net/bitcarmanlee/article/details/82819860
基于此方案 生成 训练数据和 测试数据。
问题:在测试端生成测试数据时,按照此方式(转移矩阵)生成是否合理?是否可以先按照全排列的方式生成一份测试集,再按此方式生成一份测试集,分别进行测试?难点在于:如何区分测试集中的样本是由转移矩阵生成的,还是由全排列生成的。
具体实现代码
configs
#!/usr/bin/python
#-*-coding=utf-8-*-
# Command:
# Function:
# Input:
# Format:
# Output:
# Format:
import sys
from collections import defaultdict
print >> sys.stderr, "Loading Config..."
from transform_matrix import mode, intent_transform, intent_transform_exclude
# Candidate numbers of follow-up rounds per generated dialogue; the generator
# picks one of these values with random.choice(rounds).
rounds = [3, 4]
first_turn_ratio = 0.2 # ratio of single-turn (i.e. complete) utterances appearing inside a context
other_for_neg_ratio = 0.8 # when picking a negative sample, probability of using an "other"-intent utterance (an intent outside the function point)
# NOTE(review): the data roots below are hard-coded absolute paths of one
# developer machine; every other path in this section is derived from them.
raw_single_path = "/Users/guoyaning/project/dialog_data/domain_data"
# Sub-directories of raw_single_path that are read as single-turn datasets.
single_domains = ["music", "fm", "abook", "appcontrol",
"carcontrol", "carinfo", "carmaster",
"road_condition", "map", "navigation", "setting",
"general_command", "multimediacontrol", "super_express",
"telephone", "info_inquire", "weather", "thirdparty"]
base_path = "/Users/guoyaning/project/dialog_data"
main_path = "{}/dst_data".format(base_path)
rule_path = "{}/rule_data".format(base_path)
slot_value_path = "{}/slot_meta".format(base_path)
single_turn_path = "{}/single_turn".format(main_path)
twice_turn_path = "{}/twice_turn".format(main_path)
dialog_path = "{}/dialog".format(main_path)
# Rule file and slot-value files.
pure_navi_volume_file = "{}/volume_change_to_navi_query.txt".format(rule_path)
ratio_value_file = "{}/ratio.txt".format(slot_value_path)
temperature_value_file = "{}/air_temperature.txt".format(slot_value_path)
air_volume_value_file = "{}/air_volume.txt".format(slot_value_path)
position_value_file = "{}/positions.txt".format(slot_value_path)
sound_volume_value_file = "{}/sound_volume.txt".format(slot_value_path)
navi_prefer_file = "{}/navi_preference.txt".format(slot_value_path)
time_value_file = "{}/time.txt".format(slot_value_path)
geo_value_file = "{}/geo.txt".format(slot_value_path)
poi_value_file = "{}/poi.txt".format(slot_value_path)
# Slot-value files to load for each domain (or domain group).
domain_slot_mapping = {"setting": [ratio_value_file, sound_volume_value_file],
"carcontrol": [temperature_value_file, air_volume_value_file,
position_value_file, ratio_value_file],
"weather": [time_value_file, geo_value_file],
"geo_time_slot": [time_value_file, geo_value_file, poi_value_file, navi_prefer_file]}
# Intent renaming and intent -> domain lookup tables.
intent2intent = {"search_car_restriction": "traffic_control"}
intent2domain = {"navigate": "navigation", "get_how_long": "navigation", "get_how_far": "navigation", "transit_poi": "navigation",
"search_road_condition": "road_condition", "traffic_control": "info_inquire",
"get_weather": "weather", "get_rain": "weather", "get_snow": "weather", "get_wind": "weather",
"get_temperature": "weather", "get_hail": "weather", "get_haze": "weather", "get_humidity": "weather"}
# Normalisation of raw slot keys found in the datasets (applied after lower-casing).
# NOTE(review): "天津" (a city name) is listed as a slot key here — confirm this is intentional.
slot_norm_map = {"highwayfi": "highwayfirst", "avoidchar": "avoidcharge", "avoidhigh": "avoidhighway", "时间": "time", "天津": "geo", "dst": "geo",
"via": "geo", "loc": "geo", "home": "geo", "company": "geo", "date": "time"}
# Intents listed per domain.
valid_domain_intent = {"map": ["zoom_in", "zoom_out", "zoom_min", "zoom_max"],
"navigation": ["view_route", "view_route_quit"],
"road_condition": ["start_road_condition", "exit_road_condition"],
"setting": ["mute", "unmute", "volume_up", "volume_down", "volume_min", "volume_max",
"set_volume", "navi_down", "navi_up", "navi_mute"],
"weather": ["get_weather", "get_wind", "get_temperature", "get_rain",
"get_snow", "get_humidity", "get_haze", "get_hail"],
"carcontrol": ["open_window", "close_window", "down_window", "up_window",
"open_sunroof", "close_sunroof", "up_sunroof", "down_sunroof",
"open_heat_seat", "close_heat_seat", "open_air", "close_air",
"up_air_temperature", "down_air_temperature", "up_air_volume", "down_air_volume",
"set_air_temperature", "set_air_temperature_high", "set_air_temperature_low",
"set_air_volume", "set_air_volume_high", "set_air_volume_low"],
"entertain": ["play_music"]}
# Function-point level intent groups: each set lists the "domain-intent"
# names that belong to the same function point.
map_function = {
    "map-zoom_in", "map-zoom_out", "map-zoom_min", "map-zoom_max",
}
navi_function = {
    "navigation-view_route", "navigation-view_route_quit",
}
road_condition_function = {
    "road_condition-start_road_condition",
    "road_condition-exit_road_condition",
}
setting_function = {
    "setting-mute", "setting-unmute",
    "setting-volume_up", "setting-volume_down",
    "setting-volume_min", "setting-volume_max", "setting-set_volume",
    "setting-navi_down", "setting-navi_up", "setting-navi_mute",
}
window_function = {
    "carcontrol-open_window", "carcontrol-close_window",
    "carcontrol-up_window", "carcontrol-down_window",
}
sunroof_function = {
    "carcontrol-open_sunroof", "carcontrol-close_sunroof",
    "carcontrol-up_sunroof", "carcontrol-down_sunroof",
}
heat_seat_functions = {
    "carcontrol-open_heat_seat", "carcontrol-close_heat_seat",
}
air_temp_functions = {
    "carcontrol-open_air", "carcontrol-close_air",
    "carcontrol-up_air_temperature", "carcontrol-down_air_temperature",
    "carcontrol-set_air_temperature",
    "carcontrol-set_air_temperature_high",
    "carcontrol-set_air_temperature_low",
}
air_volume_function = {
    "carcontrol-open_air", "carcontrol-close_air",
    "carcontrol-up_air_volume", "carcontrol-down_air_volume",
    "carcontrol-set_air_volume",
    "carcontrol-set_air_volume_high", "carcontrol-set_air_volume_low",
}
weather_function = {
    "weather-get_weather", "weather-get_rain", "weather-get_snow",
    "weather-get_temperature", "weather-get_haze", "weather-get_hail",
    "weather-get_humidity", "weather-get_wind",
}
geo_time_function = {
    "navigation-navigate", "navigation-get_how_long",
    "navigation-get_how_far", "navigation-transit_poi",
    "road_condition-search_road_condition",
    "info_inquire-traffic_control",
}
# The weather intents are folded into the geo/time function point;
# weather_function itself is not listed separately in all_functions.
geo_time_function.update(weather_function)
music_function = {"entertain-play_music"}

all_functions = [
    map_function, navi_function, road_condition_function, setting_function,
    window_function, sunroof_function, heat_seat_functions,
    air_temp_functions, air_volume_function, geo_time_function,
    music_function,
]

# For every "domain-intent", the union of all function points it belongs to.
# An intent that sits in several groups (e.g. "carcontrol-open_air") maps to
# the union of those groups. The inner loop must be nested inside the outer
# one; with the flattened indentation this was a syntax error.
intent_function_mapping = defaultdict(set)
for func in all_functions:
    for key in func:
        intent_function_mapping[key].update(func)
# action -> list of "domain-intent" names that realise that action.
# NOTE(review): "carcontrol-down_window" is listed under "up" and
# "carcontrol-up_window" under "down"; presumably lowering a window counts as
# opening it further — confirm against the business definition.
action_intent_mapping = {
    "up": ["map-zoom_in", "setting-volume_up", "setting-navi_up",
           "carcontrol-down_window", "carcontrol-up_sunroof",
           "carcontrol-up_air_temperature", "carcontrol-up_air_volume"],
    "down": ["map-zoom_out", "setting-volume_down", "setting-navi_down",
             "carcontrol-up_window", "carcontrol-down_sunroof",
             "carcontrol-down_air_temperature",
             "carcontrol-down_air_volume"],
    "set": ["setting-set_volume", "carcontrol-set_air_temperature",
            "carcontrol-set_air_volume"],
    "min": ["map-zoom_min", "setting-volume_min",
            "carcontrol-set_air_temperature_low",
            "carcontrol-set_air_volume_low"],
    "max": ["map-zoom_max", "setting-volume_max",
            "carcontrol-set_air_temperature_high",
            "carcontrol-set_air_volume_high"],
    "open": ["navigation-view_route", "road_condition-start_road_condition",
             "carcontrol-open_window", "carcontrol-open_sunroof",
             "carcontrol-open_heat_seat", "carcontrol-open_air"],
    "pause": ["navigation-view_route_quit",
              "road_condition-exit_road_condition",
              "carcontrol-close_window", "carcontrol-close_sunroof",
              "carcontrol-close_heat_seat", "carcontrol-close_air"],
    "mute": ["setting-mute", "setting-navi_mute"],
    "unmute": ["setting-unmute"],
    "inquire": ["weather-get_weather", "weather-get_rain", "weather-get_snow",
                "weather-get_temperature", "weather-get_humidity",
                "weather-get_wind", "weather-get_haze", "weather-get_hail",
                "info_inquire-traffic_control",
                "road_condition-search_road_condition",
                "navigation-get_how_long", "navigation-get_how_far"],
    "goto": ["navigation-navigate", "navigation-transit_poi"],
    "search_play": ["entertain-positive_play"],
    "other": ["other-other"],
}

# Inverse lookup: "domain-intent" -> action. `items()` is used instead of the
# Python-2-only `iteritems()` so the module also loads on Python 3; on
# Python 2 the result is identical.
intent_action_mapping = {
    intent: action
    for action, intents in action_intent_mapping.items()
    for intent in intents
}
# Slots that may appear under each intent.
# NOTE(review): "down_air_temperature" maps to [""] while its counterpart
# "up_air_temperature" maps to ["air_degree"]; this looks like an unfinished
# entry — confirm before relying on it.
valid_slot_keys = {"open_window": ["window_ratio", "position"], "close_window": ["window_ratio", "position"],
                   "open_sunroof": ["window_ratio"], "close_sunroof": ["window_ratio"],
                   "open_heat_seat": ["position"], "close_heat_seat": ["position"],
                   "up_air_temperature": ["air_degree"], "down_air_temperature": [""]}

# Slots ignored, per domain-intent, when building the query-pool index key
# (domain-intent-slot_keys); "default" applies when the domain-intent has no
# entry of its own.
slot_as_key_ignored = {"default": ["category"],
                       "navigation-navigate": ["avoidhighway", "avoidcharge", "avoidjam", "highwayfirst", "category"]}

# intent_strong_dependency = ["同样", "一样", "同上"]
all_position = ["全部", "全都", "全部都", "所有", "全车"]


def _load_norm_position(path):
    """Read the position file at *path* into a {raw value: normalised value} dict.

    Each usable line has exactly three "\\001"-separated fields; the first
    field is mapped to the third. Lines with another field count, or with an
    empty third field, are skipped. The file is closed once it has been read.
    """
    mapping = {}
    with open(path, "r") as position_f:
        for ln in position_f:
            fields = ln.strip().split("\001")
            if len(fields) == 3 and fields[2]:
                mapping[fields[0]] = fields[2]
    return mapping


def _load_stripped_lines(path):
    """Return the set of whitespace-stripped lines of the file at *path*."""
    with open(path, "r") as lines_f:
        return {ln.strip() for ln in lines_f}


norm_position = _load_norm_position(position_value_file)

# Values that are replaced only with some probability during slot replacement,
# e.g. "一点": if they were always replaced they could all turn into concrete
# ratios and utterances such as "再大一点" would disappear from the corpus.
random_replace_value = {"volume_up": {"一点", "一些", "一丝", "点儿", "点", "些"},
                        "volume_down": {"一点", "一些", "一丝", "点儿", "点", "些"},
                        "navi_up": {"一点", "一些", "一丝", "点儿", "点", "些"},
                        "navi_down": {"一点", "一些", "一丝", "点儿", "点", "些"}}

# Queries that only switch the target to navigation volume (e.g.
# "我是说导航音量") without changing the volume level.
pure_navi_volume_query = _load_stripped_lines(pure_navi_volume_file)
# Intents that act on the navigation volume.
navi_intents = {"navi_up", "navi_down", "navi_mute"}
# Intents that always cause a change between two consecutive turns.
change_intents = {"mute", "unmute", "volume_max", "volume_min", "navi_mute"}
# Intents for which no slot replace operation is performed.
intent_no_replace = {"up_window", "down_window", "up_sunroof", "down_sunroof",
"set_air_temperature_high", "set_air_temperature_low",
"set_air_volume_high", "set_air_volume_low", "mute", "unmute",
"volume_min", "volume_max"}
# Domain-identifying words, used for category extraction.
# Several literals below repeat inside the same set; duplicates collapse
# because these are set literals.
music_domain_query = {"歌", "歌儿", "曲", "曲儿", "曲目", "歌曲", "音乐", "曲子", "小曲", "小曲儿", "轻音乐", "民谣",
"摇滚", "电音", "mv", "催眠曲", "睡眠曲", "乐", "儿歌", "童谣", "电子音", "电子琴", "dj", "迪斯科",
"古琴", "独奏", "串烧", "的士高", "萨克斯", "葫芦丝", "古筝", "二胡", "电子音", "情歌"}
abook_domain_query = {"直播", "相声", "小品", "相声小品", "脱口秀", "小说", "新闻", "游戏动漫", "戏曲", "畅销书", "故事",
"诗歌", "党团课", "公开课", "广播剧", "京剧", "豫剧", "节目", "专辑", "滚动头条", "今日头条", "头条",
"电视剧", "自媒体", "直播", "相声", "小品", "评书", "脱口秀", "散文", "小说", "段子", "游戏动漫",
"戏曲", "畅销书", "河南戏", "河南坠子", "故事", "黄梅戏", "诗歌", "诗词", "唐诗", "宋词", "文学", "古诗",
"资讯", "党团课", "二人转", "公开课", "动画片", "景点", "广播剧", "健康养生", "养生", "健康", "京剧",
"晋剧", "豫剧", "名著", "昆剧", "湘剧", "评剧", "蒲剧", "潮剧", "话剧", "川戏", "川剧", "喜剧", "秦腔",
"歌剧", "童话", "讲座", "越剧", "昆曲", "花鼓戏", "粤剧", "剧", "有声读物", "有声书", "专辑", "节目",
"笑话", "读物", "声音", "头条", "播客", "有声", "书", "多媒体"}
fm_domain_query = {"广播", "广播台", "广播电台", "频道", "电台", "fm电台", "调频电台", "fm", "收音机", "调频", "频率",
"台", "电台节目", "中波", "广播剧", "调幅"}
navi_domain_query = {"高速优先", "全程高速", "上高速", "高速路线", "高速模式", "导航", "电话",
"拥堵", "偏好", "中途点", "途经点", "地图", "路线", "高速",
"躲避", "避免", "车头", "距离", "线路", "实时路况", "实时路况播报", "路况",
"目的地", "出发地", "公路", "全程", "交通", "路段", "路程",
"优先", "播报", "应用", "导游航", "系统", "程序", "指令", "语音", "高德",
"仪表盘", "屏幕", "途经点", "路口", "道口", "岔口", "交叉口", "十字路口", "位置", "全程", "3d", "2d"}
carcontrol_domain_query = {"空调", "温度", "空调温度", "室温",
"空调风量", "空调风力", "空调风速", "风量", "风力", "风速",
"车窗", "窗户", "玻璃", "天窗", "座椅加热", "座位加热", "位子加热", "加热"}
setting_domain_query = {"音量", "声音", "导航", "导航音量", "声", "音", "导航播报", "路况播报"}
weather_domain_query = {"天气", "气候", "空气", "空气质量", "雾霾", "霾", "冰雹", "雹子",
"下雨", "有雨", "雨水", "雨天", "降雨", "雨量",
"下雪", "有雪", "降雪", "雪天", "雪量",
"风力", "刮风", "级风", "大风", "台风", "刮大风", "风速", "刮大风",
"温度", "空气温度", "气温", "湿度", "空气湿度", "潮湿"}
traffic_control_domain_query = {"限行", "限号"}
# Domain -> keyword set; "map", "navigation" and "road_condition" share the
# navigation keyword set.
domain_query_mapping = {"music": music_domain_query, "abook": abook_domain_query, "fm": fm_domain_query,
"map": navi_domain_query, "navigation": navi_domain_query,
"road_condition": navi_domain_query, "setting": setting_domain_query,
"carcontrol": carcontrol_domain_query, "weather": weather_domain_query,
"info_inquire": traffic_control_domain_query}
# Mapping between model slots and business slots.
slot_mapping = {"air_degree": "degree", "value": "degree", "volume_degree": "degree", "window_ratio": "degree",
"category": "category", "position": "position", "geo": "geo", "time": "time", "artist": "artist", "person": "artist",
"avoidcharge": "avoidcharge", "avoidhighway": "avoidhighway", "avoidjam": "avoidjam", "highwayfirst": "highwayfirst",
"frequency": "keyword", "program": "keyword", "album": "keyword", "keyword": "keyword", "radio": "keyword"}
# Slot mapping applied inside the belief state.
# The belief state is the model input and has to match the final slots stored
# in the online context manager.
# Example: for the navigate intent the business location slot is "location",
# so the "geo" slot under that intent is mapped to "location".
state_slot_mapping = {"navigation-navigate": {"geo": "location", "highwayfirst": "navigate_preference", "avoidjam": "navigate_preference",
"avoidcharge": "navigate_preference", "avoidhighway": "navigate_preference"},
"navigation-get_how_long": {"geo": "location"},
"navigation-get_how_far": {"geo": "location"},
"road_condition-search_road_condition": {"geo": "location"},
"info_inquire-traffic_control": {"geo": "loc", "time": "date"}}
# 定义转移矩阵
#air_volume_keys = ["carcontrol-up_air_volume-#", "carcontrol-up_air_volume-volume_degree",
# "carcontrol-down_air_volume-#", "carcontrol-down_air_volume-volume_degree",
# "carcontrol-set_air_volume-volume_degree", "carcontrol-set_air_volume_high-#",
# "carcontrol-set_air_volume_high-volume_degree", "carcontrol-set_air_volume_low-#",
# "carcontrol-set_air_volume_low-volume_degree"]
#air_temperature_keys = ["carcontrol-up_air_temperature-#", "carcontrol-up_air_temperature-air_degree",
# "carcontrol-down_air_temperature-#", "carcontrol-down_air_temperature-air_degree",
# "carcontrol-set_air_temperature-air_degree", "carcontrol-set_air_temperature_high-#",
# "carcontrol-set_air_temperature_high-air_degree", "carcontrol-set_air_temperature_low-#",
# "carcontrol-set_air_temperature_low-air_degree"]
#heat_seat_keys = ["carcontrol-open_heat_seat-#", "carcontrol-open_heat_seat-position",
# "carcontrol-close_heat_seat-#", "carcontrol-close_heat_seat-position"]
#
#window_keys = ["carcontrol-open_window-#", "carcontrol-open_window-window_ratio",
# "carcontrol-open_window-position", "carcontrol-open_window-position#window_ratio",
# "carcontrol-close_window-#", "carcontrol-close_window-window_ratio",
# "carcontrol-close_window-position", "carcontrol-close_window-position#window_ratio",
# "carcontrol-up_window-#", "carcontrol-up_window-window_ratio",
# "carcontrol-up_window-position", "carcontrol-up_window-position#window_ratio",
# "carcontrol-down_window-#", "carcontrol-down_window-window_ratio",
# "carcontrol-down_window-position", "carcontrol-down_window-position#window_ratio"]
#
#sunroof_keys = ["carcontrol-open_sunroof-#", "carcontrol-open_sunroof-window_ratio",
# "carcontrol-close_sunroof-#", "carcontrol-close_sunroof-window_ratio",
# "carcontrol-up_sunroof-#", "carcontrol-up_sunroof-window_ratio",
# "carcontrol-down_sunroof-#", "carcontrol-down_sunroof-window_ratio"]
#
#sound_value_keys = ["setting-volume_up-#", "setting-volume_up-value",
# "setting-volume_down-#", "setting-volume_down-value",
# "setting-volume_min-#", "setting-volume_min-value",
# "setting-volume_max-#", "setting-volume_max-value",
# "setting-set_volume-value", "setting-mute-#"]
#
#if mode == "dst":
# sound_value_keys = ["setting-volume_down-value", "setting-volume_up-value",
# "setting-volume_max-value", "setting-volume_min-value",
# "setting-set_volume-value", "setting-mute-category", "setting-unmute-category"]
#
#get_humidity_value_keys = ["weather-get_humidity-#", "weather-get_humidity-geo",
# "weather-get_humidity-time", "weather-get_humidity-geo#time"]
#
#get_snow_value_keys = ["weather-get_snow-#", "weather-get_snow-geo",
# "weather-get_snow-time", "weather-get_snow-geo#time"]
#
#get_hail_value_keys = ["weather-get_hail-#", "weather-get_hail-geo",
# "weather-get_hail-time", "weather-get_hail-geo#time"]
#
#get_wind_value_keys = ["weather-get_wind-#", "weather-get_wind-geo",
# "weather-get_wind-time", "weather-get_wind-geo#time"]
#
#carcontrol_intent_transform = {"carcontrol-up_air_volume-volume_degree": set(air_volume_keys),
# "carcontrol-down_air_volume-volume_degree": set(air_volume_keys),
# "carcontrol-set_air_volume_high-volume_degree": set(air_volume_keys),
# "carcontrol-set_air_volume_low-volume_degree": set(air_volume_keys),
# "carcontrol-up_air_temperature-air_degree": set(air_temperature_keys),
# "carcontrol-down_air_temperature-air_degree": set(air_temperature_keys),
# "carcontrol-set_air_temperature_high-air_degree": set(air_temperature_keys),
# "carcontrol-set_air_temperature_low-air_degree": set(air_temperature_keys),
# "carcontrol-up_sunroof-window_ratio": set(sunroof_keys),
# "carcontrol-down_sunroof-window_ratio": set(sunroof_keys),
# "carcontrol-up_window-window_ratio": set(window_keys),
# "carcontrol-up_window-position#window_ratio": set(window_keys),
# "carcontrol-down_window-window_ratio": set(window_keys),
# "carcontrol-down_window-position#window_ratio": set(window_keys)}
#
#setting_intent_transform = {"setting-volume_up-value": set(sound_value_keys),
# "setting-volume_down-value": set(sound_value_keys),
# "setting-volume_max-value": set(sound_value_keys),
# "setting-volume_min-value": set(sound_value_keys),
# "setting-mute-category": }
#
#weather_intent_transform = {"weather-get_humidity-geo#time": set(get_humidity_value_keys),
# "weather-get_wind-geo": set(get_wind_value_keys),
# "weather-get_hail-geo": set(get_hail_value_keys),
# "weather-get_snow-geo": set(get_snow_value_keys)}
#
#intent_transform = {"carcontrol": carcontrol_intent_transform,
# "weather": weather_intent_transform}
# #"setting": setting_intent_transform,
#
#intent_transform_exclude = {"carcontrol-set_air_volume_high-#": {air_volume_keys[i] for i in [0, 1, 5, 6]},
# "carcontrol-set_air_volume_high-volume_degree": {air_volume_keys[i] for i in [0, 1, 5, 6]},
# "carcontrol-set_air_volume_low-#": {air_volume_keys[i] for i in [2, 3, 7, 8]},
# "carcontrol-set_air_volume_low-volume_degree": {air_volume_keys[i] for i in [2, 3, 7, 8]},
# "carcontrol-set_air_temperature_high-#": {air_temperature_keys[i] for i in [0, 1, 5, 6]},
# "carcontrol-set_air_temperature_high-air_degree": {air_temperature_keys[i] for i in [0, 1, 5, 6]},
# "carcontrol-set_air_temperature_low-#": {air_temperature_keys[i] for i in [2, 3, 7, 8]},
# "carcontrol-set_air_temperature_low-air_degree": {air_temperature_keys[i] for i in [2, 3, 7, 8]},
# "carcontrol-open_heat_seat-#": {heat_seat_keys[0]},
# "carcontrol-open_heat_seat-position": {heat_seat_keys[0]},
# "carcontrol-close_heat_seat-#": {heat_seat_keys[2]},
# "carcontrol-close_heat_seat-position": {heat_seat_keys[2]},
# "carcontrol-up_air_volume-#": {air_volume_keys[6]},
# "carcontrol-up_air_volume-volume_degree": {air_volume_keys[6]},
# "carcontrol-down_air_volume-#": {air_volume_keys[5]},
# "carcontrol-down_air_volume-volume_degree": {air_volume_keys[5]},
# "carcontrol-up_window-window_ratio": {window_keys[0]},
# "carcontrol-up_window-position#window_ratio": {window_keys[0]},
# "carcontrol-down_window-window_ratio": {window_keys[0]},
# "carcontrol-down_window-position#window_ratio": {window_keys[0]},
# "carcontrol-up_sunroof-window_ratio": {sunroof_keys[0]},
# "carcontrol-down_sunroof-window_ratio": {sunroof_keys[0]}}
## "setting-volume_min-#": {sound_value_keys[i] for i in [2, 3, 4, 5]},
## "setting-volume_min-value": {sound_value_keys[i] for i in [2, 3, 4, 5]},
## "setting-volume_max-#": {sound_value_keys[i] for i in [0, 1, 6, 7]},
## "setting-volume_max-value": {sound_value_keys[i] for i in [0, 1, 6, 7]},
## "setting-mute-#": {sound_value_keys[i] for i in [2, 3, 4, 5, 9]},
## "setting-mute-value": {sound_value_keys[i] for i in [2, 3, 4, 5, 9]}}
#
#
generator_dialog
#!/usr/bin/python
#-*-coding=utf-8-*-
# Command:
# Function:
# Input:
# Format:
# Output:
# Format:
import os
import numpy as np
from math import ceil
from utils import *
def get_input_filename(file_path):
    """Return the dataset file name to read from *file_path*, or None.

    "dataset.dst" is preferred when the global ``mode`` is "dst" and that file
    exists; otherwise "dataset.ctx" is used if it exists. None means neither
    candidate is present.
    """
    candidates = []
    if mode == "dst":
        candidates.append("dataset.dst")
    candidates.append("dataset.ctx")
    for name in candidates:
        if os.path.isfile("{}/{}".format(file_path, name)):
            return name
    return None
def get_positive_filename():
    """Return the positive-sample output file name for the current ``mode``."""
    if mode == "dst":
        return "positive.dst"
    return "positive.ctx"
def get_negative_filename():
    """Return the negative-sample output file name for the current ``mode``."""
    if mode == "dst":
        return "negative.dst"
    return "negative.ctx"
def _slot_key_string(slots, domain_intent):
    """Return the slot-key component of a query-pool index key.

    *slots* is the raw ``key:value|key:value`` field of a dataset line. The
    keys are lower-cased, normalised through ``slot_norm_map``, filtered by
    the ``slot_as_key_ignored`` entry of *domain_intent* (when one exists) and
    joined with "#".

    "#" alone is returned when there is no slot at all and also when every
    slot was filtered out; the latter used to yield "" and therefore a key
    that differed from the "#" form the generator builds for an empty slot
    set.
    """
    if not slots:
        return "#"
    keys = set(kv.split(":")[0].lower() for kv in slots.split("|"))
    keys = set(slot_norm_map.get(k, k) for k in keys)
    if domain_intent in slot_as_key_ignored:
        ignored = slot_as_key_ignored[domain_intent]
        keys = set(k for k in keys if k not in ignored)
    return "#".join(keys) if keys else "#"


def load_first_turn_info():
    """Load the single-turn datasets into a first-turn query pool.

    Every directory of ``raw_single_path`` that is listed in
    ``single_domains`` is read through the file chosen by
    ``get_input_filename``. Lines must hold five "\\001"-separated fields:
    query, domain, pmintent, intent, slots.

    Returns:
        dict mapping "domain-intent-slot_keys" (or "other" for intents that
        are not in ``intent_function_mapping``) to a list of
        ``(query, slots)`` tuples.
    """
    turn_info = defaultdict(set)
    for dirname in os.listdir(raw_single_path):
        if dirname not in single_domains:
            continue
        file_path = "{}/{}".format(raw_single_path, dirname)
        if not os.path.isdir(file_path):
            continue
        filename = get_input_filename(file_path)
        if not filename:
            # The message names the files get_input_filename really looks for.
            sys.stderr.write("No dataset.ctx OR dataset.dst in file path: {}\n".format(file_path))
            continue
        with open("{}/{}".format(file_path, filename), "r") as dataset:
            for ln in dataset:
                info = ln.strip().split("\001")
                if len(info) != 5:
                    continue
                query, domain, _pmintent, intent, slots = info
                # NOTE(review): the source indentation was lost; the intent
                # normalisation is assumed to apply only to the entertainment
                # domains — confirm.
                if domain in ["music", "fm", "abook"]:
                    domain = "entertain"
                    if "play_music" in intent:
                        intent = "play_music"
                    if "play_abook" in intent:
                        intent = "play_abook"
                    if "play_fm" in intent:
                        intent = "play_fm"
                domain_intent = "-".join([domain, intent])
                if domain_intent in intent_function_mapping:
                    key = "-".join([domain, intent, _slot_key_string(slots, domain_intent)])
                else:
                    key = "other"
                turn_info[key].add((query, slots))
    return {k: list(v) for k, v in turn_info.items()}


def load_twice_turn_info(target_domain):
    """Build the second-turn query pools and the intent-slot transition matrix.

    Reads the two-turn dataset of *target_domain* under ``twice_turn_path``.
    Each usable line holds two "\\001"-separated turns, each turn being four
    "\\002"-separated fields: query, domain, intent, slots.

    Returns:
        None when no dataset file is found, otherwise a tuple
        ``(slot_twice_turn_queries, intent_twice_turn_queries, transform_matrix)``:

        * slot_twice_turn_queries: second-turn pool used only when the
          current turn has the same intent as the previous one;
        * intent_twice_turn_queries: second-turn pool used only when the
          current intent differs from the previous one;
        * transform_matrix: first-turn key -> list of reachable second-turn
          keys, extended by the ``intent_transform`` whitelist and reduced by
          the ``intent_transform_exclude`` blacklist of *target_domain*.
    """
    slot_twice_turn_queries = defaultdict(set)
    intent_twice_turn_queries = defaultdict(set)
    transform_matrix = defaultdict(set)
    file_path = "{}/{}".format(twice_turn_path, target_domain)
    filename = get_input_filename(file_path)
    if not filename:
        sys.stderr.write("No dataset.ctx OR dataset.dst in file path: {}\n".format(file_path))
        return None
    with open("{}/{}".format(file_path, filename)) as dataset:
        for ln in dataset:
            info = ln.strip().split("\001")
            if len(info) != 2:
                continue
            query_1_info, query_2_info = info[0].split("\002"), info[1].split("\002")
            if len(query_1_info) != 4 or len(query_2_info) != 4:
                continue
            _query_1, domain_1, intent_1, slots_1 = query_1_info
            query_2, domain_2, intent_2, slots_2 = query_2_info
            slot_keys_1 = _slot_key_string(slots_1, "-".join([domain_1, intent_1]))
            slot_keys_2 = _slot_key_string(slots_2, "-".join([domain_2, intent_2]))
            key_1 = "-".join([domain_1, intent_1, slot_keys_1])
            key_2 = "-".join([domain_2, intent_2, slot_keys_2])
            if intent_1 == intent_2:
                slot_twice_turn_queries[key_2].add((query_2, slots_2))
            else:
                intent_twice_turn_queries[key_2].add((query_2, slots_2))
            transform_matrix[key_1].add(key_2)
    # Whitelist: transitions that are always allowed.
    if target_domain in intent_transform:
        for key, values in intent_transform[target_domain].items():
            transform_matrix[key].update(values)
    # Blacklist: transitions removed from keys that already exist.
    if target_domain in intent_transform_exclude:
        for key, values in intent_transform_exclude[target_domain].items():
            if key in transform_matrix:
                transform_matrix[key] -= values
    slot_twice_turn_queries = {k: list(v) for k, v in slot_twice_turn_queries.items()}
    intent_twice_turn_queries = {k: list(v) for k, v in intent_twice_turn_queries.items()}
    transform_matrix = {k: list(v) for k, v in transform_matrix.items()}
    return slot_twice_turn_queries, intent_twice_turn_queries, transform_matrix
def _category_context(domain, min_max_length):
    """Return ``(domain_query, min_len, max_len)`` for category extraction in *domain*.

    Falls back to an empty keyword set and ``(0, 0)`` when the domain has no
    entry in ``domain_query_mapping`` / *min_max_length*.
    """
    min_len, max_len = min_max_length.get(domain, (0, 0))
    return domain_query_mapping.get(domain, set()), min_len, max_len


def _serialise_slots(slots):
    """Render ``[(key, value), ...]`` as ``key:value|key:value``."""
    return "|".join([":".join([k, v]) for k, v in slots])


def _serialise_turn(query, domain, intent, slots, state_slots):
    """Render one turn as "\\002"-joined query, domain, intent, action, slots, state slots."""
    action = intent_action_mapping["-".join([domain, intent])]
    return "\002".join([query, domain, intent, action,
                        _serialise_slots(slots), _serialise_slots(state_slots)])


def generator(target_domain, dialog_num):
    """Generate positive and negative multi-turn dialogues for *target_domain*.

    Positive dialogues are produced by walking the transition matrix returned
    by ``load_twice_turn_info``: for every first-turn key a first utterance is
    drawn, then ``random.choice(rounds)`` follow-up turns are generated with
    ``generate_pos_turn``. Negative samples pair a first turn with a turn from
    ``generate_neg_turn``; they are grouped by "first intent->negative intent"
    and sub-sampled so that each transition kind contributes about the same
    amount.

    Output goes to ``dialog_path/target_domain`` (positive and negative file).
    Turns are separated by "\\001", fields inside a turn by "\\002".

    Args:
        target_domain: domain whose two-turn dataset drives the generation.
        dialog_num: approximate number of positive dialogues to generate.

    Returns:
        None. The function returns early, without touching the output files,
        when the two-turn dataset is missing or yields no transitions; this
        used to fail with TypeError / ZeroDivisionError.

    NOTE(review): the indentation of the original source was lost; the block
    nesting below is reconstructed from the data flow — confirm against the
    original file.
    """
    print("Loading First Turn Info...")
    first_turn_info = load_first_turn_info()
    print("Loading Twice Turn Info...")
    twice_turn_info = load_twice_turn_info(target_domain)
    if twice_turn_info is None:
        # load_twice_turn_info has already reported the missing dataset.
        return
    slot_twice_turn_info, intent_twice_turn_info, transform_matrix = twice_turn_info
    if not transform_matrix:
        sys.stderr.write("Empty transform_matrix for domain: {}\n".format(target_domain))
        return
    turn_query_info = (first_turn_info, slot_twice_turn_info, intent_twice_turn_info)

    print("Generating Dialogues...")
    slot_value_pool = {}
    for d, files in domain_slot_mapping.items():
        slot_value_pool[d] = load_slot_values(files)
    if target_domain not in slot_value_pool:
        slot_value_pool[target_domain] = {}

    # Shortest / longest keyword length (in characters) per domain.
    min_max_length = {}
    for d in domain_query_mapping:
        lengths = [len(v.decode("utf-8")) for v in domain_query_mapping[d]]
        min_max_length[d] = (min(lengths), max(lengths))

    query_substr_pool = {}
    covered_functions = {v for fun in all_functions for v in fun}

    output_path = "{}/{}".format(dialog_path, target_domain)
    if not os.path.isdir(output_path):
        os.makedirs(output_path)
    positive_path = "{}/{}".format(output_path, get_positive_filename())
    negative_path = "{}/{}".format(output_path, get_negative_filename())

    value_map = {}
    trans_cases = defaultdict(list)
    loop_num = int(ceil(float(dialog_num) / len(transform_matrix)))

    # The context manager guarantees both outputs are flushed and closed.
    with open(positive_path, "w") as positive_f, open(negative_path, "w") as negative_f:
        for _ in range(loop_num):
            round_num = random.choice(rounds)
            for key_1 in transform_matrix:
                if key_1 not in first_turn_info:
                    continue
                key_info_1 = key_1.split("-")
                if len(key_info_1) != 3:
                    continue
                domain_1, intent_1, _slot_keys_1 = key_info_1
                query_1, slots_1_str = random.choice(first_turn_info[key_1])
                slots_1 = []
                if slots_1_str:
                    for kv in slots_1_str.split("|"):
                        # Split on the first ":" only so that values which
                        # themselves contain ":" are not truncated.
                        slot_key, slot_value = kv.split(":", 1)
                        slot_key = slot_key.lower()
                        slots_1.append((slot_norm_map.get(slot_key, slot_key), slot_value))
                query_1, slots_1, value_map = slot_replace(slot_value_pool[target_domain], query_1,
                                                           intent_1, slots_1, value_map)
                # Derive the category slot of query_1.
                if "category" not in set([kv[0] for kv in slots_1]):
                    domain_query, min_len, max_len = _category_context(domain_1, min_max_length)
                    category_slots_1, query_substr_pool = extract_category_slot(
                        domain_1, intent_1, query_1, query_substr_pool,
                        domain_query, min_len, max_len)
                    slots_1 += category_slots_1
                intent_1 = intent_mapping(intent_1, slots_1)

                # Positive turns inside the function point.
                pre_key = key_1
                pre_state = (domain_1, intent_1, slots_1)
                pos_turns = []
                for _round in range(round_num):
                    if pre_key not in transform_matrix:
                        sys.stderr.write("{} not in transform_matrix\n".format(pre_key))
                        # pre_key cannot change any more, so the remaining
                        # rounds would only repeat this message.
                        break
                    _key_2, query_2, domain_2, intent_2, slots_2, value_map = generate_pos_turn(
                        pre_key, pre_state, turn_query_info, transform_matrix,
                        slot_value_pool[target_domain], value_map)
                    # Derive the category slot of query_2.
                    if "category" not in set([kv[0] for kv in slots_2]):
                        domain_query, min_len, max_len = _category_context(domain_2, min_max_length)
                        category_slots_2, query_substr_pool = extract_category_slot(
                            domain_2, intent_2, query_2, query_substr_pool,
                            domain_query, min_len, max_len)
                        slots_2 += category_slots_2
                    state_slots = merge_slots(pre_state, (domain_2, intent_2, slots_2))
                    intent_2 = intent_mapping(intent_2, state_slots)
                    pos_turns.append((query_2, domain_2, intent_2, slots_2, state_slots))
                    # The next lookup key is built from the merged state, not
                    # from the slots of the current turn alone.
                    ignored_slot_keys = slot_as_key_ignored.get(
                        "-".join([domain_2, intent_2]), slot_as_key_ignored["default"])
                    state_keys = set([kv[0] for kv in state_slots if kv[0] not in ignored_slot_keys])
                    pre_key = "-".join([domain_2, intent_2, "#".join(state_keys) if state_keys else "#"])
                    pre_state = (domain_2, intent_2, state_slots)

                # Positive sample: for the first turn, slots and state slots coincide.
                first_turn = _serialise_turn(query_1, domain_1, intent_1, slots_1, slots_1)
                turns_output = [_serialise_turn(q, d, i, sl, st) for q, d, i, sl, st in pos_turns]
                positive_f.write("{}\001{}\n".format(first_turn, "\001".join(turns_output)))

                # Negative candidates, grouped by transition kind.
                neg_turns, value_map = generate_neg_turn(key_1, first_turn_info, covered_functions,
                                                         slot_value_pool, value_map)
                for query_neg, domain_neg, intent_neg, slots_neg in neg_turns:
                    if "category" not in set([kv[0] for kv in slots_neg]):
                        domain_query_neg, min_len, max_len = _category_context(domain_neg, min_max_length)
                        # As in the original, the returned pool is discarded here.
                        category_slots_neg, _ = extract_category_slot(
                            domain_neg, intent_neg, query_neg, query_substr_pool,
                            domain_query_neg, min_len, max_len)
                        slots_neg += category_slots_neg
                    intent_neg = intent_mapping(intent_neg, slots_neg)
                    trans_key = "{}->{}".format("-".join([domain_1, intent_1]),
                                                "-".join([domain_neg, intent_neg]))
                    trans_cases[trans_key].append((query_1, slots_1, query_neg, slots_neg))

        # Number of transition kinds from this function point to other ones
        # (including "other"), i.e. the number of negative sample kinds.
        trans_num = len(trans_cases)
        if not trans_num:
            # No negative candidate was produced; avoid dividing by zero.
            return
        # Number of positive turns generated.
        pos_turn_num = dialog_num * np.mean(rounds)
        # Average amount of data per negative sample kind.
        num_per_trans = pos_turn_num / trans_num
        for trans_key, trans_values in trans_cases.items():
            first_key, neg_key = trans_key.split("->")
            first_domain, first_intent = first_key.split("-")
            neg_domain, neg_intent = neg_key.split("-")
            sample_num = min(int(ceil(num_per_trans)), len(trans_values))
            for first_query, first_slots, neg_query, neg_slots in random.sample(trans_values, sample_num):
                first_turn_line = _serialise_turn(first_query, first_domain, first_intent,
                                                  first_slots, first_slots)
                neg_turn_line = _serialise_turn(neg_query, neg_domain, neg_intent,
                                                neg_slots, neg_slots)
                negative_f.write("{}\001{}\n".format(first_turn_line, neg_turn_line))
"""
def generator_entertain(target_domain):
print "Loading First Turn Info..."
first_turn_info = load_first_turn_info()
domain_query = set()
if target_domain in domain_query_mapping:
domain_query = domain_query_mapping[target_domain]
slot_value_pool = {}
for d, files in domain_slot_mapping.iteritems():
slot_values = load_slot_values(files)
slot_value_pool[d] = slot_values
min_len, max_len = 0, 0
if domain_query:
lengths = [len(v.decode("utf-8")) for v in domain_query]
min_len, max_len = min(lengths), max(lengths)
query_substr_pool = {}
if target_domain in ["music", "abook", "fm"]:
target_domain = "entertain"
output_path = "{}/{}".format(dialog_path, target_domain)
if not os.path.exists(output_path) or not os.path.isdir(output_path):
os.makedirs(output_path)
positive_f = open("{}/{}".format(output_path, get_positive_filename()), "w")
negative_f = open("{}/{}".format(output_path, get_negative_filename()), "w")
print "Generating Dialogues..."
file_path = "{}/{}".format(twice_turn_path, target_domain)
filename = get_input_filename(file_path)
if not filename:
print >> sys.stderr, "No dataset.txt OR dataset.dst in file path: {}".format(file_path)
return None
for ln in open("{}/{}".format(file_path, filename)):
info = ln.strip().split("\001")
if len(info) != 2:
continue
query_1_info, query_2_info = info[0].split("\002"), info[1].split("\002")
if len(query_1_info) != 5 or len(query_2_info) != 5:
continue
query_1, domain_1, intent_1, action_1, slots_1_str = query_1_info[:]
query_2, domain_2, intent_2, action_2, slots_2_str = query_2_info[:]
slots_1, slots_2 = [], []
if slots_1_str:
slots_1 = [(kv.split(":")[0], kv.split(":")[1]) for kv in slots_1_str.split("|")]
if slots_2_str:
slots_2 = [(kv.split(":")[0], kv.split(":")[1]) for kv in slots_2_str.split("|")]
if "category" not in set([kv[0] for kv in slots_1]):
category_slots_1, query_substr_pool = extract_category_slot(domain_1, intent_1, query_1, query_substr_pool,
domain_query, min_len, max_len)
slots_1 += category_slots_1
if "category" not in set([kv[0] for kv in slots_2]):
category_slots_2, query_substr_pool = extract_category_slot(domain_2, intent_2, query_2, query_substr_pool,
domain_query, min_len, max_len)
slots_2 += category_slots_2
slots_1_str = "|".join([":".join([k, v]) for k, v in slots_1])
info_1 = "\002".join([query_1, domain_1, intent_1, action_1, slots_1_str, slots_1_str])
if intent_2 in ["play_music", "play_abook", "play_fm"]:
intent_2 = "positive_play"
slots_merge = entertain_slot_merge(intent_1, slots_1, intent_2, slots_2)
slots_2_str = "|".join([":".join([k, v]) for k, v in slots_2])
slots_merge_str = "|".join([":".join([k, v]) for k, v in slots_merge])
info_2 = "\002".join([query_2, domain_2, intent_2, action_2, slots_2_str, slots_merge_str])
positive_f.write("{}\001{}\n".format(info_1, info_2))
trans_pre_key = "-".join([domain_1, intent_1])
query_trans, domain_trans, intent_trans, slots_trans, _ = generate_trans_turn(trans_pre_key, first_turn_info, slot_value_pool, {})
trans_domain_query = set()
if domain_trans in domain_query_mapping:
domain_query_trans = domain_query_mapping[domain_trans]
if slots_trans:
if "category" not in set([kv[0] for kv in slots_trans]):
category_slots_trans, _ = extract_category_slot(domain_trans, intent_trans, query_trans,
query_substr_pool, domain_query_trans, min_len, max_len)
slots_trans += category_slots_trans
slots_trans_str = "|".join([":".join([k, v]) for k, v in slots_trans])
info_trans = "\002".join([query_trans, domain_trans, intent_trans, intent_action_mapping["-".join([domain_trans, intent_trans])],
slots_trans_str, slots_trans_str])
negative_f.write("{}\001{}\n".format(info_1, info_trans))
query_neg, domain_neg, intent_neg, slots_neg = \
generate_neg_turn(first_turn_info)
slots_neg_str = "|".join([":".join([k, v]) for k, v in slots_neg])
info_neg = "\002".join([query_neg, domain_neg, intent_neg, "other", slots_neg_str, slots_neg_str])
negative_f.write("{}\001{}\n".format(info_1, info_neg))
"""
def generate_pos_turn(pre_key, pre_state, turn_query_info, transform_matrix, slot_value_pool, value_map):
    """Sample the next (positive) dialogue turn that may follow the previous state.

    :param pre_key: key of the previous turn, "domain-intent-slot_keys" (exactly three "-" parts)
    :param pre_state: previous dialog state, handed to valid_query_transform / valid_state_transform
    :param turn_query_info: 3-tuple (first_turn_query, slot_twice_turn_query, intent_twice_turn_query);
        each maps a "domain-intent-slot_keys" key to a list of (query, slots_str) pairs
    :param transform_matrix: maps pre_key to the candidate keys of the next turn
        (used with random.choice, so presumably an indexable sequence -- confirm with the caller)
    :param slot_value_pool: slot value candidates, passed through to slot_replace
    :param value_map: cache object passed through slot_replace
    :return: (cur_key, query, domain, intent, slots, value_map)
    """
    first_turn_query, slot_twice_turn_query, intent_twice_turn_query = turn_query_info
    # The unpack also enforces that pre_key has exactly three "-"-separated parts.
    _pre_domain, pre_intent, _pre_slot_keys = pre_key.split("-")
    # Keep sampling until a valid state transition is found.
    while True:
        # Pick the domain-intent-slots of the current turn from the previous dialog state.
        cur_key = random.choice(transform_matrix[pre_key])
        key_info = cur_key.split("-")
        if len(key_info) != 3:
            continue
        domain, intent, _slot_keys = key_info
        # Pick a suitable query for the current turn (mainly relevant for the navigation-volume
        # queries of the setting domain).
        get_valid = True
        while True:
            if cur_key not in first_turn_query or random.random() < 1 - first_turn_ratio:
                # Elliptical queries such as "副驾的" may belong to open_heat_seat-position as well
                # as close_heat_seat-position and must agree with the previous intent, so they only
                # live in the slot-change pool, which is used only when the intent is unchanged.
                if pre_intent == intent and intent not in change_intents:
                    pool, pool_name = slot_twice_turn_query, "slot_twice_turn_query"
                else:
                    pool, pool_name = intent_twice_turn_query, "intent_twice_turn_query"
                if cur_key not in pool:
                    sys.stderr.write("Query Not Found in {}. cur_key: {}; keys: {}\n".format(
                        pool_name, cur_key, ",".join(pool.keys())))
                    get_valid = False
                    break
                query, slots_str = random.choice(pool[cur_key])
            else:
                query, slots_str = random.choice(first_turn_query[cur_key])
            if valid_query_transform(pre_state, (query, domain, intent)):
                break
        if not get_valid:
            # No query exists for this key. Resample another key: the previous `break` fell
            # through to the return with `query`/`slots` unbound (first pass) or left over from
            # an earlier, rejected candidate.
            continue
        slots = []
        if slots_str:
            slots = [(kv.split(":")[0].lower(), kv.split(":")[1]) for kv in slots_str.split("|")]
            slots = [(slot_norm_map[k] if k in slot_norm_map else k, v) for k, v in slots]
        query, slots, value_map = slot_replace(slot_value_pool, query, intent, slots, value_map)
        if valid_state_transform(pre_state, (query, domain, intent, slots)):
            break
    return cur_key, query, domain, intent, slots, value_map
def generate_neg_turn(pre_key, first_turn_info, all_function_values, slot_value_pool, value_map):
    """Build negative follow-up turns for the function of the previous turn.

    One turn is sampled for every entry of all_function_values that is not listed in
    intent_function_mapping for the previous "domain-intent", plus one "other" turn.

    :param pre_key: key of the previous turn; its first two "-" parts are domain and intent
    :param first_turn_info: maps a turn key to a list of (query, slots_str) pairs; must contain "other"
    :param all_function_values: set of candidate "domain-intent" prefixes
    :param slot_value_pool: maps a domain to the slot values handed to slot_replace
    :param value_map: cache object passed through slot_replace
    :return: (neg_turns, value_map) where neg_turns is a list of (query, domain, intent, slots)
    """
    # Index the full turn keys by their prefix (everything before the last "-").
    keys_by_prefix = defaultdict(list)
    for full_key in first_turn_info:
        prefix = full_key.rsplit("-", 1)[0]
        keys_by_prefix[prefix].append(full_key)

    pre_domain, pre_intent = pre_key.split("-")[:2]
    own_functions = intent_function_mapping["-".join([pre_domain, pre_intent])]

    neg_turns = []
    for prefix in all_function_values - own_functions:
        if prefix not in keys_by_prefix:
            sys.stderr.write("{} not in first_turn_key_map\n".format(prefix))
            continue
        chosen_key = random.choice(keys_by_prefix[prefix])
        query, slots_str = random.choice(first_turn_info[chosen_key])
        domain, intent = chosen_key.split("-")[:2]
        slots = []
        if slots_str:
            for pair in slots_str.split("|"):
                fields = pair.split(":")
                slots.append((fields[0], fields[1]))
        domain_values = slot_value_pool.get(domain, {})
        query, slots, value_map = slot_replace(domain_values, query, intent, slots, value_map)
        neg_turns.append((query, domain, intent, slots))

    # Always finish with one out-of-scope ("other") utterance; its slots are dropped.
    other_query, _other_slots = random.choice(first_turn_info["other"])
    neg_turns.append((other_query, "other", "other", []))
    return neg_turns, value_map
def entertain_slot_merge(pre_intent, pre_slots, cur_intent, cur_slots):
    """Merge the slots of two consecutive entertainment turns.

    Only a previous "play_music" turn carries slots over:
      * current "play_music": current slots plus every previous slot whose key is not
        present in the current turn;
      * current "play_abook" / "play_fm": current slots only;
      * current "negative_play": previous slots whose key is absent from the current
        turn, except "artist" and "keyword";
      * anything else: empty list.

    :param pre_intent: intent of the previous turn
    :param pre_slots: [(key, value), ...] of the previous turn
    :param cur_intent: intent of the current turn
    :param cur_slots: [(key, value), ...] of the current turn (never modified)
    :return: a new list of merged (key, value) pairs
    """
    if pre_intent != "play_music":
        return []
    cur_keys = set(kv[0] for kv in cur_slots)
    if cur_intent in ("play_abook", "play_fm", "play_music"):
        # Copy first: the old code aliased cur_slots and then appended to it, silently
        # changing the caller's list.
        merged_slots = list(cur_slots)
        if cur_intent == "play_music":
            merged_slots.extend((key, value) for key, value in pre_slots if key not in cur_keys)
        return merged_slots
    if cur_intent == "negative_play":
        return [(key, value) for key, value in pre_slots
                if key not in cur_keys and key not in ("artist", "keyword")]
    return []
if __name__ == "__main__":
    # Command line: python <script> domain [dialog_num]
    if len(sys.argv) not in [2, 3]:
        sys.stderr.write("USAGE: python %s domain [dialog_num(INTEGER)]\n" % sys.argv[0])
        sys.exit(1)
    domain = sys.argv[1]
    if len(sys.argv) == 3:
        try:
            dialog_num = int(sys.argv[2])
        except ValueError:
            # int() reports a non-numeric string with ValueError (TypeError was never raised
            # here), and continuing after the usage message would run with a bad count.
            sys.stderr.write("USAGE: python %s domain dialog_num(INTEGER)\n" % sys.argv[0])
            sys.exit(1)
    # NOTE(review): without the optional argument, dialog_num must already exist at module
    # level (presumably from the configs import) -- confirm, otherwise this is a NameError.
    if domain == "entertain":
        # NOTE(review): generator_entertain appears only inside a triple-quoted string in this
        # listing; confirm it is still defined before relying on this branch.
        generator_entertain(domain)
    else:
        generator(domain, dialog_num)
"""
for i in range(20):
slots, _ = extract_category_slot("帮我把地图缩小", {}, navi_domain_query, 1, 6)
for s in slots:
print s
"""
transform_matrix
#!/usr/bin/python
#-*-coding=utf-8-*-
# Command:
# Function:
# 定义了对话状态转移矩阵
# Input:
# Format:
# Output:
# Format:
from collections import defaultdict
mode = "context"
#mode = "dst"
#################################################### carcontrol ###############################################
window_keys = {"context": ["carcontrol-open_window-#", "carcontrol-open_window-window_ratio",
"carcontrol-open_window-position", "carcontrol-open_window-position#window_ratio",
"carcontrol-close_window-#", "carcontrol-close_window-window_ratio",
"carcontrol-close_window-position", "carcontrol-close_window-position#window_ratio",
"carcontrol-up_window-#", "carcontrol-up_window-window_ratio",
"carcontrol-up_window-position", "carcontrol-up_window-position#window_ratio",
"carcontrol-down_window-#", "carcontrol-down_window-window_ratio",
"carcontrol-down_window-position", "carcontrol-down_window-position#window_ratio"],
"dst": []}
sunroof_keys = {"context": ["carcontrol-open_sunroof-#", "carcontrol-open_sunroof-window_ratio",
"carcontrol-close_sunroof-#", "carcontrol-close_sunroof-window_ratio",
"carcontrol-up_sunroof-#", "carcontrol-up_sunroof-window_ratio",
"carcontrol-down_sunroof-#", "carcontrol-down_sunroof-window_ratio"],
"dst": []}
air_volume_keys = {"context": ["carcontrol-up_air_volume-#", "carcontrol-up_air_volume-volume_degree",
"carcontrol-down_air_volume-#", "carcontrol-down_air_volume-volume_degree",
"carcontrol-set_air_volume-volume_degree", "carcontrol-set_air_volume_high-#",
"carcontrol-set_air_volume_high-volume_degree", "carcontrol-set_air_volume_low-#",
"carcontrol-set_air_volume_low-volume_degree"],
"dst": []}
air_temperature_keys = {"context": ["carcontrol-up_air_temperature-#", "carcontrol-up_air_temperature-air_degree",
"carcontrol-down_air_temperature-#", "carcontrol-down_air_temperature-air_degree",
"carcontrol-set_air_temperature-air_degree", "carcontrol-set_air_temperature_high-#",
"carcontrol-set_air_temperature_high-air_degree", "carcontrol-set_air_temperature_low-#",
"carcontrol-set_air_temperature_low-air_degree"],
"dst": []}
heat_seat_keys = {"context": ["carcontrol-open_heat_seat-#", "carcontrol-open_heat_seat-position",
"carcontrol-close_heat_seat-#", "carcontrol-close_heat_seat-position"],
"dst": []}
carcontrol_intent_transform = {"carcontrol-up_air_volume-volume_degree": set(air_volume_keys[mode]),
"carcontrol-down_air_volume-volume_degree": set(air_volume_keys[mode]),
"carcontrol-set_air_volume_high-volume_degree": set(air_volume_keys[mode]),
"carcontrol-set_air_volume_low-volume_degree": set(air_volume_keys[mode]),
"carcontrol-up_air_temperature-air_degree": set(air_temperature_keys[mode]),
"carcontrol-down_air_temperature-air_degree": set(air_temperature_keys[mode]),
"carcontrol-set_air_temperature_high-air_degree": set(air_temperature_keys[mode]),
"carcontrol-set_air_temperature_low-air_degree": set(air_temperature_keys[mode]),
"carcontrol-up_sunroof-window_ratio": set(sunroof_keys[mode]),
"carcontrol-down_sunroof-window_ratio": set(sunroof_keys[mode]),
"carcontrol-up_window-window_ratio": set(window_keys[mode]),
"carcontrol-up_window-position#window_ratio": set(window_keys[mode]),
"carcontrol-down_window-window_ratio": set(window_keys[mode]),
"carcontrol-down_window-position#window_ratio": set(window_keys[mode])}
carcontrol_intent_exclude = {"context": {
"carcontrol-set_air_volume_high-#": {air_volume_keys["context"][i] for i in [0, 1, 5, 6]},
"carcontrol-set_air_volume_high-volume_degree": {air_volume_keys["context"][i] for i in [0, 1, 5, 6]},
"carcontrol-set_air_volume_low-#": {air_volume_keys["context"][i] for i in [2, 3, 7, 8]},
"carcontrol-set_air_volume_low-volume_degree": {air_volume_keys["context"][i] for i in [2, 3, 7, 8]},
"carcontrol-set_air_temperature_high-#": {air_temperature_keys["context"][i] for i in [0, 1, 5, 6]},
"carcontrol-set_air_temperature_high-air_degree": {air_temperature_keys["context"][i] for i in [0, 1, 5, 6]},
"carcontrol-set_air_temperature_low-#": {air_temperature_keys["context"][i] for i in [2, 3, 7, 8]},
"carcontrol-set_air_temperature_low-air_degree": {air_temperature_keys["context"][i] for i in [2, 3, 7, 8]},
"carcontrol-open_heat_seat-#": {heat_seat_keys["context"][0]},
"carcontrol-open_heat_seat-position": {heat_seat_keys["context"][0]},
"carcontrol-close_heat_seat-#": {heat_seat_keys["context"][2]},
"carcontrol-close_heat_seat-position": {heat_seat_keys["context"][2]},
"carcontrol-up_air_volume-#": {air_volume_keys["context"][6]},
"carcontrol-up_air_volume-volume_degree": {air_volume_keys["context"][6]},
"carcontrol-down_air_volume-#": {air_volume_keys["context"][5]},
"carcontrol-down_air_volume-volume_degree": {air_volume_keys["context"][5]},
"carcontrol-up_window-window_ratio": {window_keys["context"][0]},
"carcontrol-up_window-position#window_ratio": {window_keys["context"][0]},
"carcontrol-down_window-window_ratio": {window_keys["context"][0]},
"carcontrol-down_window-position#window_ratio": {window_keys["context"][0]},
"carcontrol-up_sunroof-window_ratio": {sunroof_keys["context"][0]},
"carcontrol-down_sunroof-window_ratio": {sunroof_keys["context"][0]}},
"dst": {}}
#################################################### setting ###############################################
# Volume-related keys of the "setting" domain. The exclude table addresses them by position.
# "context" indices: 0 volume_up-#, 1 volume_up-value, 2 volume_down-#, 3 volume_down-value,
#   4 volume_min-#, 5 volume_min-value, 6 volume_max-#, 7 volume_max-value, 8 set_volume-value,
#   9 mute-#, 10 unmute-#, 11 navi_up-#, 12 navi_down-#, 13 navi_mute-#, 14 navi_up-value,
#   15 navi_down-value.
# "dst" indices: 0 volume_down-value, 1 volume_up-value, 2 volume_max-value, 3 volume_min-value,
#   4 set_volume-value, 5 mute-category, 6 unmute-category.
sound_value_keys = {"context": ["setting-volume_up-#", "setting-volume_up-value",
"setting-volume_down-#", "setting-volume_down-value",
"setting-volume_min-#", "setting-volume_min-value",
"setting-volume_max-#", "setting-volume_max-value",
"setting-set_volume-value", "setting-mute-#", "setting-unmute-#",
"setting-navi_up-#", "setting-navi_down-#", "setting-navi_mute-#",
"setting-navi_up-value", "setting-navi_down-value"],
"dst": ["setting-volume_down-value", "setting-volume_up-value",
"setting-volume_max-value", "setting-volume_min-value",
"setting-set_volume-value", "setting-mute-category", "setting-unmute-category"]}
# Every listed from-key maps to its own set containing all keys of the selected mode.
# NOTE(review): "setting-mute-category" / "setting-unmute-category" exist only in the "dst" key
# list and the navi_* keys only in the "context" list, yet both groups are from-keys here.
setting_intent_transform = {key: set(sound_value_keys[mode]) for key in ["setting-volume_up-value",
"setting-volume_down-value",
"setting-volume_max-value",
"setting-volume_min-value",
"setting-mute-category",
"setting-unmute-category",
"setting-navi_up-#",
"setting-navi_up-value",
"setting-navi_down-#",
"setting-navi_down-value",
"setting-navi_mute-#"]}
# Per mode: from-key -> set of keys paired with it for exclusion.
# "context": volume_min-* -> [2, 3, 4, 5, 10] (volume_down-*, volume_min-*, unmute-#);
#   volume_max-* -> [0, 1, 6, 7, 10] (volume_up-*, volume_max-*, unmute-#);
#   volume_up-* / volume_down-* -> [10] (unmute-#);
#   mute-* -> [2, 3, 4, 5, 9] (volume_down-*, volume_min-*, mute-#);
#   navi_up-* / navi_down-* -> [:11] (every non-navi key);
#   navi_mute-# -> indices 0..10 plus 13 (navi_mute-#).
# "dst": volume_min-value -> [0, 3, 6]; volume_max-value -> [1, 2, 6]; volume_up/down-value -> [6];
#   mute-category -> [0, 3, 5]; unmute-category -> [6].
# NOTE(review): "setting-mute-value" is a from-key here but is not in sound_value_keys["context"].
# NOTE(review): range(11)+[13] concatenates lists and therefore only works on Python 2.
setting_intent_exclude = {"context": {"setting-volume_min-#": {sound_value_keys["context"][i] for i in [2, 3, 4, 5, 10]},
"setting-volume_min-value": {sound_value_keys["context"][i] for i in [2, 3, 4, 5, 10]},
"setting-volume_max-#": {sound_value_keys["context"][i] for i in [0, 1, 6, 7, 10]},
"setting-volume_max-value": {sound_value_keys["context"][i] for i in [0, 1, 6, 7, 10]},
"setting-volume_up-#": {sound_value_keys["context"][10]},
"setting-volume_up-value": {sound_value_keys["context"][10]},
"setting-volume_down-#": {sound_value_keys["context"][10]},
"setting-volume_down-value": {sound_value_keys["context"][10]},
"setting-mute-#": {sound_value_keys["context"][i] for i in [2, 3, 4, 5, 9]},
"setting-mute-value": {sound_value_keys["context"][i] for i in [2, 3, 4, 5, 9]},
"setting-navi_up-#": set(sound_value_keys["context"][:11]),
"setting-navi_up-value": set(sound_value_keys["context"][:11]),
"setting-navi_down-#": set(sound_value_keys["context"][:11]),
"setting-navi_down-value": set(sound_value_keys["context"][:11]),
"setting-navi_mute-#": {sound_value_keys["context"][i] for i in range(11)+[13]}},
"dst": {"setting-volume_min-value": {sound_value_keys["dst"][i] for i in [0, 3, 6]},
"setting-volume_max-value": {sound_value_keys["dst"][i] for i in [1, 2, 6]},
"setting-volume_up-value": {sound_value_keys["dst"][6]},
"setting-volume_down-value": {sound_value_keys["dst"][6]},
"setting-mute-category": {sound_value_keys["dst"][i] for i in [0, 3, 5]},
"setting-unmute-category": {sound_value_keys["dst"][i] for i in [6]}}}
#################################################### weather ###############################################
# One key list per weather intent. Each "context" list has the same layout:
# index 0 "<intent>-#", 1 "<intent>-geo", 2 "<intent>-time", 3 "<intent>-geo#time".
# The geo/time tables further down slice and index these lists by position.
get_weather_value_keys = {"context": ["weather-get_weather-#", "weather-get_weather-geo",
"weather-get_weather-time", "weather-get_weather-geo#time"],
"dst": []}
get_rain_value_keys = {"context": ["weather-get_rain-#", "weather-get_rain-geo",
"weather-get_rain-time", "weather-get_rain-geo#time"],
"dst": []}
get_temperature_value_keys = {"context": ["weather-get_temperature-#", "weather-get_temperature-geo",
"weather-get_temperature-time", "weather-get_temperature-geo#time"],
"dst": []}
get_humidity_value_keys = {"context": ["weather-get_humidity-#", "weather-get_humidity-geo",
"weather-get_humidity-time", "weather-get_humidity-geo#time"],
"dst": []}
get_haze_value_keys = {"context": ["weather-get_haze-#", "weather-get_haze-geo",
"weather-get_haze-time", "weather-get_haze-geo#time"],
"dst": []}
get_snow_value_keys = {"context": ["weather-get_snow-#", "weather-get_snow-geo",
"weather-get_snow-time", "weather-get_snow-geo#time"],
"dst": []}
get_hail_value_keys = {"context": ["weather-get_hail-#", "weather-get_hail-geo",
"weather-get_hail-time", "weather-get_hail-geo#time"],
"dst": []}
get_wind_value_keys = {"context": ["weather-get_wind-#", "weather-get_wind-geo",
"weather-get_wind-time", "weather-get_wind-geo#time"],
"dst": []}
# Only four from-keys are listed; each maps to the key list of its own intent.
# The values are the list objects themselves (not sets, unlike the carcontrol/setting tables).
weather_intent_transform = {"weather-get_humidity-geo#time": get_humidity_value_keys[mode],
"weather-get_wind-geo": get_wind_value_keys[mode],
"weather-get_hail-geo": get_hail_value_keys[mode],
"weather-get_snow-geo": get_snow_value_keys[mode]}
#################################################### geo+time ###############################################
def _keys_at(keys, indices):
    """Return keys[i] for every i in indices that exists in keys.

    The "dst" key lists are empty, so plain indexing raised IndexError at import time as
    soon as mode was switched to "dst"; missing positions are skipped instead.
    """
    return [keys[i] for i in indices if i < len(keys)]


# Key lists of the intents that carry geo and/or time slots. The exclude pairs below address
# them by position, so element order matters.
traffic_control_value_keys = {"context": ["info_inquire-traffic_control-#", "info_inquire-traffic_control-geo",
                                          "info_inquire-traffic_control-time", "info_inquire-traffic_control-geo#time"],
                              "dst": []}
road_condition_value_keys = {"context": ["road_condition-search_road_condition-#", "road_condition-search_road_condition-geo"],
                             "dst": []}
# Indices 0-3 are the slot-less keys, 4-7 the "-geo" keys, in the same intent order.
navi_geo_value_keys = {"context": ["navigation-navigate-#", "navigation-get_how_long-#", "navigation-get_how_far-#",
                                   "navigation-transit_poi-#", "navigation-navigate-geo", "navigation-get_how_long-geo",
                                   "navigation-get_how_far-geo", "navigation-transit_poi-geo"],
                       "dst": []}
# From-keys: every key that carries at least one slot (the slot-less "-#" entries are sliced off).
# NOTE(review): the "context" entry is built from the [mode] lists, so it is only populated
# while mode == "context"; kept as-is because only [mode] is read afterwards.
geo_time_from_keys = {"context": get_weather_value_keys[mode][1:] + get_rain_value_keys[mode][1:] + get_snow_value_keys[mode][1:] +
                      get_temperature_value_keys[mode][1:] + get_humidity_value_keys[mode][1:] + get_haze_value_keys[mode][1:] +
                      get_hail_value_keys[mode][1:] + get_wind_value_keys[mode][1:] + traffic_control_value_keys[mode][1:] +
                      road_condition_value_keys[mode][1:] + navi_geo_value_keys[mode][4:],
                      "dst": []}
# To-keys: every key of every geo/time intent, slot-less ones included.
geo_time_to_keys = {"context": get_weather_value_keys[mode] + get_rain_value_keys[mode] + get_snow_value_keys[mode] +
                    get_temperature_value_keys[mode] + get_humidity_value_keys[mode] + get_haze_value_keys[mode] +
                    get_hail_value_keys[mode] + get_wind_value_keys[mode] + traffic_control_value_keys[mode] +
                    road_condition_value_keys[mode] + navi_geo_value_keys[mode],
                    "dst": []}
# Every from-key shares the same to-key list object.
geo_time_intent_transform = {from_key: geo_time_to_keys[mode] for from_key in geo_time_from_keys[mode]}

# (from_key, to_key) pairs collected into geo_time_intent_exclude below.
geo_time_exclude_elem = set()
# 1) Every key of an intent is paired with the slot-less key of that same intent.
_same_intent_groups = [
    ("info_inquire-traffic_control-#", traffic_control_value_keys[mode]),
    ("road_condition-search_road_condition-#", road_condition_value_keys[mode]),
    ("navigation-navigate-#", _keys_at(navi_geo_value_keys[mode], [0, 4])),
    ("navigation-get_how_long-#", _keys_at(navi_geo_value_keys[mode], [1, 5])),
    ("navigation-get_how_far-#", _keys_at(navi_geo_value_keys[mode], [2, 6])),
    ("navigation-transit_poi-#", _keys_at(navi_geo_value_keys[mode], [3, 7])),
    ("weather-get_weather-#", get_weather_value_keys[mode]),
    ("weather-get_rain-#", get_rain_value_keys[mode]),
    ("weather-get_temperature-#", get_temperature_value_keys[mode]),
    ("weather-get_humidity-#", get_humidity_value_keys[mode]),
    ("weather-get_haze-#", get_haze_value_keys[mode]),
    ("weather-get_snow-#", get_snow_value_keys[mode]),
    ("weather-get_hail-#", get_hail_value_keys[mode]),
    ("weather-get_wind-#", get_wind_value_keys[mode]),
]
for _to_key, _from_keys in _same_intent_groups:
    geo_time_exclude_elem.update((_from_key, _to_key) for _from_key in _from_keys)
# 2) Every time-only key (index 2 of its list) is paired with each slot-less
#    navigation / road-condition / traffic-control key.
_time_only_keys = [_key
                   for _keys in (traffic_control_value_keys, get_weather_value_keys, get_rain_value_keys,
                                 get_temperature_value_keys, get_humidity_value_keys, get_haze_value_keys,
                                 get_snow_value_keys, get_hail_value_keys, get_wind_value_keys)
                   for _key in _keys_at(_keys[mode], [2])]
_slotless_geo_targets = {"navigation-navigate-#", "navigation-get_how_long-#", "navigation-get_how_far-#",
                         "navigation-transit_poi-#", "road_condition-search_road_condition-#",
                         "info_inquire-traffic_control-#"}
geo_time_exclude_elem.update((_from_key, _to_key)
                             for _from_key in _time_only_keys for _to_key in _slotless_geo_targets)

# Group the pairs by from-key; only the entry of the selected mode is filled.
geo_time_intent_exclude = {"context": defaultdict(set),
                           "dst": defaultdict(set)}
for k, v in geo_time_exclude_elem:
    geo_time_intent_exclude[mode][k].add(v)
# Exported tables, keyed by function group: from-key -> candidate next keys.
intent_transform = {"carcontrol": carcontrol_intent_transform,
"setting": setting_intent_transform,
"weather": weather_intent_transform,
"geo_time_slot": geo_time_intent_transform}
# Exported exclusion tables for the selected mode: from-key -> keys paired with it for exclusion.
# NOTE(review): there is no "weather" entry here although intent_transform has one -- confirm
# that consumers do not index this dict with every key of intent_transform.
intent_transform_exclude = {"carcontrol": carcontrol_intent_exclude[mode],
"setting": setting_intent_exclude[mode],
"geo_time_slot": geo_time_intent_exclude[mode]}
utils
#!/usr/bin/python
#-*-coding=utf-8-*-
# Command:
# Function:
# Input:
# Format:
# Output:
# Format:
import sys
import random
from collections import defaultdict
from configs import *
sys.path.append("../")
# ############################## 纯工具, 生成可枚举slot-values #################################
# Building blocks for the enumerable numeric slot values: Chinese numerals (index 0 is the
# empty string so that it can act as an "absent digit") and their Arabic counterparts.
hanzi_num = ["", "零", "一", "二", "三", "四", "五", "六", "七", "八", "九", "十", "百"]
num = [str(i) for i in range(10)] + ["10", "100"]


def build_degree():
    """Regenerate the enumerable slot-value files for ratios, air temperature and air volume.

    Writes three files (paths come from the config module):
      * ratio_value_file: fractions "<denominator>分之<numerator>" in Chinese and Arabic
        numerals, the literals "百分百" and "一半", and "<number>%" forms;
      * temperature_value_file: every number bare and suffixed with "度", plus "两度";
      * air_volume_value_file: every number bare and suffixed with "档" / "格", plus
        "两档" and "两格".

    All three files are opened (and truncated) up front and are now closed on exit, also
    when writing fails; previously the handles were never closed.
    """
    pre_hanzi = hanzi_num[3:]
    pre_num = num[2:]
    post_1 = hanzi_num[1:-2] +\
        ["".join([i, "十", j]) for i in hanzi_num[0:1]+hanzi_num[3:-2] for j in hanzi_num[0:1]+hanzi_num[2:-2]] +\
        hanzi_num[-1:] + ["一百"]
    post_2 = hanzi_num[1:-1] +\
        ["".join([i, j]) for i in hanzi_num[3:-1] for j in hanzi_num[2:-2]] + hanzi_num[-1:] + ["一百"]
    post_3 = num[:-1] + ["".join([i, j]) for i in num[1:-2] for j in num[:-2]] + num[-1:]

    with open(ratio_value_file, "w") as ratio_f, \
            open(temperature_value_file, "w") as temperature_f, \
            open(air_volume_value_file, "w") as volume_f:

        def write_fraction(denominator, numerator):
            ratio_f.write("{}\n".format("".join([denominator, "分之", numerator])))

        # Every denominator except the last one: numerators are the first i+2 entries of the
        # Chinese and Arabic numerator lists.
        for denominators in (pre_hanzi, pre_num):
            for i in range(len(denominators) - 1):
                for j in range(i + 2):
                    write_fraction(denominators[i], post_1[j])
                    write_fraction(denominators[i], post_3[j])
        # The last denominator ("百" / "100") is combined with every numerator spelling.
        for numerators in (post_1, post_2, post_3):
            for p in numerators:
                write_fraction(pre_hanzi[-1], p)
                write_fraction(pre_num[-1], p)
        for p in ["百分百", "一半"]:
            ratio_f.write("{}\n".format(p))
        for p in set(post_1 + post_3):
            ratio_f.write("{}%\n".format(p))

        for value in set(post_1 + post_2 + post_3):
            temperature_f.write("{}\n".format(value))
            temperature_f.write("{}度\n".format(value))
            volume_f.write("{}\n".format(value))
            volume_f.write("{}档\n".format(value))
            volume_f.write("{}格\n".format(value))
        temperature_f.write("两度\n")
        volume_f.write("两档\n")
        volume_f.write("两格\n")
# ############################## 跟slot相关的操作 #################################
def generate_substr(query, min_len, max_len):
    """Collect every substring of query whose character length lies in [min_len, max_len].

    Lengths are counted in characters, not bytes: a UTF-8 byte string is decoded before
    slicing and each substring is encoded again, so the result has the same string type as
    query. Already-decoded text is accepted as well and sliced directly.

    :param query: UTF-8 encoded byte string or text
    :param min_len: smallest substring length, in characters
    :param max_len: largest substring length, in characters (capped at the query length)
    :return: set of substrings
    """
    is_bytes = isinstance(query, bytes)
    text = query.decode("utf-8") if is_bytes else query
    total = len(text)
    substr_pool = set()
    for size in range(min_len, min(max_len, total) + 1):
        # Visit only the start offsets that leave room for `size` characters. The cap at
        # `total` keeps the degenerate size <= 0 case identical to the former full scan.
        for start in range(min(total, total - size + 1)):
            piece = text[start:start + size]
            substr_pool.add(piece.encode("utf-8") if is_bytes else piece)
    return substr_pool
def tagger(query, query_substr_pool, slot_pool):
    """Find the slot values that occur in query, preferring the longest match.

    :param query: the query to tag
    :param query_substr_pool: set of substrings extracted from query
    :param slot_pool: set of candidate values for one slot type
    :return: list of the substrings of query to tag, longest first

    Candidates are tried from longest to shortest. A candidate contained in a longer
    candidate is dropped, and every accepted candidate is removed from the query so that
    overlapping shorter candidates no longer match. Equal-length candidates are ordered by
    value; before, tie-breaking and the order of the result followed set iteration order
    and could differ between runs.
    """
    def char_len(text):
        # Count characters, not bytes, for UTF-8 encoded byte strings.
        return len(text.decode("utf-8")) if isinstance(text, bytes) else len(text)

    candidates = sorted(query_substr_pool & slot_pool, key=lambda c: (-char_len(c), c))
    slots = []
    seen = []
    for cand in candidates:
        # Shadowed when any earlier (never shorter) candidate contains it.
        shadowed = any(cand in longer for longer in seen)
        seen.append(cand)
        if shadowed or query.find(cand) == -1:
            continue
        slots.append(cand)
        query = query.replace(cand, "")
    return slots
def extract_category_slot(domain, intent, query, query_substr_pool, domain_query, min_len, max_len):
    """Tag the "category" slot values that occur in query.

    :param domain: domain of the query, forwarded to category_value_trans
    :param intent: intent of the query, forwarded to category_value_trans
    :param query: the query to tag
    :param query_substr_pool: cache mapping a query to its substring set; updated in place
    :param domain_query: set of candidate category values
    :param min_len: smallest substring length handed to generate_substr
    :param max_len: largest substring length handed to generate_substr
    :return: ([("category", value), ...], query_substr_pool)
    """
    # Reuse the cached substrings unless the entry is missing or empty.
    # NOTE(review): the cache key is the query alone, so an entry built with other
    # min_len/max_len bounds is reused unchanged.
    substrings = query_substr_pool.get(query)
    if not substrings:
        substrings = generate_substr(query, min_len, max_len)
        query_substr_pool[query] = substrings

    slots = []
    for raw_value in tagger(query, substrings, domain_query):
        mapped = category_value_trans(domain, intent, raw_value)
        # Values mapped to an empty string are dropped.
        if mapped:
            slots.append(("category", mapped))
    return slots, query_substr_pool
def category_value_trans(domain, intent, value):
    """Normalise a tagged category value.

    Only the "setting" domain with an intent outside navi_intents is affected: "导航音量"
    becomes "音量", and "导航" / "导航播报" / "路况播报" become the empty string. Every other
    value is returned unchanged.
    """
    if domain != "setting" or intent in navi_intents:
        return value
    if value == "导航音量":
        return "音量"
    if value in {"导航", "导航播报", "路况播报"}:
        return ""
    return value
def load_slot_values(slot_value_files):
    """Load the candidate values of every slot from the slot-value files.

    Each line has the form "<value>\\001<intent>:<slot>|<intent>:<slot>|...". Lines with
    fewer than two fields, empty values and malformed "<intent>:<slot>" items are skipped.

    :param slot_value_files: paths of the files that list the values of each slot
    :return: {intent: {slot: [values]}}

    Fixes over the previous version: every file is closed after reading, an empty value no
    longer raises IndexError, and a value ending in "%" is also registered with the
    full-width percent sign.
    NOTE(review): the old `value.replace("%", "%")` was a no-op, presumably because the
    full-width character was normalised away in this listing -- confirm against the
    original file.
    """
    values = defaultdict(lambda: defaultdict(set))
    for slot_value_file in slot_value_files:
        with open(slot_value_file, "r") as handle:
            for ln in handle:
                info = ln.strip().split("\001")
                if len(info) < 2:
                    continue
                value, intent_slots = info[:2]
                if not value:
                    continue
                variants = [value]
                if value[-1] == "%":
                    variants.append(value.replace("%", "％"))
                for intent_slot in intent_slots.split("|"):
                    intent_slot_info = intent_slot.split(":")
                    if len(intent_slot_info) != 2:
                        continue
                    intent, slot = intent_slot_info
                    values[intent][slot].update(variants)
    # Plain dicts and lists: callers sample the values with random.choice.
    return {intent: {slot: list(vals) for slot, vals in slot_map.items()}
            for intent, slot_map in values.items()}
def _swap_percent_width(slot_value):
    """Return slot_value with a trailing "%" swapped for "％" or the other way round.

    Works on characters, so a UTF-8 byte string loses the whole multi-byte sign rather
    than a single byte. Values without a trailing percent sign are returned unchanged.
    """
    is_bytes = isinstance(slot_value, bytes)
    text = slot_value.decode("utf-8") if is_bytes else slot_value
    if text.endswith(u"%"):
        text = text[:-1] + u"％"
    elif text.endswith(u"％"):
        text = text[:-1] + u"%"
    else:
        return slot_value
    return text.encode("utf-8") if is_bytes else text


def slot_replace(slot_values, query, query_intent, query_slots, value_map):
    """Augment a query by replacing its slot values with randomly sampled candidates.

    :param slot_values: all slots and values, {"intent1": {"slot1": [values], "slot2": [values]}, ...}
    :param query: the query to rewrite
    :param query_intent: intent of the query
    :param query_slots: slot-value pairs present in the query, [(k1, v1), (k2, v2), ...]
    :param value_map: cache mapping extracted values to the values finally used; it is
        returned untouched by the current implementation
    :return: (query after replacement, new [(key, value), ...] list, value_map)

    Slots without candidates for this intent, and slots whose value cannot be located in
    the query, are kept unchanged. If a value ending in a percent sign is not found, the
    other width of the sign ("%" / "％") is tried before giving up.
    NOTE(review): the two percent branches were byte-identical no-ops in this listing; the
    half/full-width reading is a reconstruction -- confirm against the original file.
    """
    if query_intent not in slot_values:
        return query, query_slots, value_map
    intent_values = slot_values[query_intent]
    new_slot = []
    for slot_key, slot_value in query_slots:
        if slot_key not in intent_values:
            new_slot.append((slot_key, slot_value))
            continue
        if query.find(slot_value) == -1:
            alternative = _swap_percent_width(slot_value)
            if query.find(alternative) == -1:
                # Appended as a tuple like every other path (this one used to append a
                # list, which never compares equal to the tuples used downstream).
                new_slot.append((slot_key, slot_value))
                continue
            slot_value = alternative
        candidates = intent_values[slot_key]
        # NOTE(review): the original comment said that a value outside the slot's candidate
        # set is a special value that must not be replaced (e.g. "一点" in "再大一点", which
        # is used only for context slot merging, not for augmentation). The visible code
        # only keeps values listed in random_replace_value, and only half of the time.
        value = random.choice(candidates)
        if query_intent in random_replace_value and slot_value in random_replace_value[query_intent]:
            if random.random() < 0.5:
                value = slot_value
        if slot_key == "position":
            # Resample until the value is not one of all_position.
            while value in all_position:
                value = random.choice(candidates)
        elif slot_key in ["air_degree", "volume_degree", "value"]:
            # A bare "一" is never used as a degree/value.
            while value == "一":
                value = random.choice(candidates)
        query = query.replace(slot_value, value)
        new_slot.append((slot_key, value))
    return query, new_slot, value_map
def merge_slots(pre_state, cur_state):
    """Rule-based DST: merge the slots of the previous turn into the current turn.

    :param pre_state: state of the previous turn: [domain, intent, [(k1, v1), (k2, v2), ...]]
    :param cur_state: state of the current turn: [domain, intent, [(k1, v1), (k2, v2), ...]]
    :return: merged slots: [(k1, v1), (k2, v2), ...]
    """
    pre_domain, pre_intent, pre_slots = pre_state[:]
    cur_domain, cur_intent, cur_slots = cur_state[:]

    # Current slot values grouped by slot key.
    values_by_key = defaultdict(list)
    for key, val in cur_slots:
        values_by_key[key].append(val)

    # Pick the rule set from the current domain or "domain-intent" function.
    to_add, to_remove = [], []
    function_key = "-".join([cur_domain, cur_intent])
    if cur_domain == "carcontrol":
        to_add, to_remove = carcontrol_merge_slots(pre_slots, values_by_key)
    elif cur_domain == "setting":
        to_add, to_remove = setting_merge_slots(pre_slots, cur_intent, values_by_key)
    elif function_key in map_function | navi_function | road_condition_function:
        to_add, to_remove = category_merge_slots(pre_slots, values_by_key)
    elif function_key in geo_time_function:
        to_add, to_remove = geo_time_merge_slots(pre_state, cur_state)

    # Current slots minus the removed ones, followed by the inherited ones.
    merged = []
    for key, val in cur_slots:
        if (key, val) not in to_remove:
            merged.append((key, val))
    merged += to_add
    return merged
def carcontrol_merge_slots(pre_slots, cur_keys):
    """Decide which slots of the previous turn a carcontrol turn inherits.

    :param pre_slots: [(key, value), ...] of the previous turn
    :param cur_keys: mapping slot key -> values of the current turn (merge_slots passes a
        list of values per key; a single bare value is accepted as well)
    :return: (slots to add, slots to remove); nothing is ever removed

    Rules:
      * "position" is inherited when the current turn has no position, or when its
        position is a relative one ("另一边" / "另边");
      * "window_ratio" is inherited only when the current turn names a position but no ratio;
      * "air_degree" / "volume_degree" are never inherited;
      * any other slot is inherited when its key is absent from the current turn.
    """
    relative_positions = ("另一边", "另边")
    add_slots = []
    for k, v in pre_slots:
        if k == "position":
            if k not in cur_keys:
                add_slots.append((k, v))
                continue
            # cur_keys[k] is a list of values; comparing the list itself against the
            # strings (as before) was always False, so the relative case never fired.
            cur_values = cur_keys[k]
            if not isinstance(cur_values, (list, tuple, set, frozenset)):
                cur_values = [cur_values]
            if any(cur_v in relative_positions for cur_v in cur_values):
                add_slots.append((k, v))
        elif k == "window_ratio":
            if "position" in cur_keys and "window_ratio" not in cur_keys:
                add_slots.append((k, v))
        elif k in ["air_degree", "volume_degree"]:
            continue
        elif k not in cur_keys:
            add_slots.append((k, v))
    return add_slots, []
# Slot carry-over rules for the "setting" domain.
# pre_slots: [(key, value), ...] of the previous turn.
# cur_intent: intent of the current turn (compared against "mute", "unmute", "navi_mute").
# cur_keys: mapping slot key -> values of the current turn (iterated per key below).
# Returns (list of slots to add, list of slots to remove); both are de-duplicated through sets.
# NOTE(review): this listing lost its indentation. The nesting of the branches below -- in
# particular which statement each `else:` belongs to and whether `if all_in_pre:` sits inside
# `if reserve_pre:` -- must be confirmed against the original file before changing any logic.
def setting_merge_slots(pre_slots, cur_intent, cur_keys):
add_slots, remove_slots = set(), set()
for k, v in pre_slots:
if k == "category":
# Flags describing how the previous category relates to the current category values.
reserve_pre = True
navi_cur = False
all_in_pre = True
# Mute-type category values are never carried over.
if v in ["静音", "取消静音", "导航静音"]:
continue
# The current turn has no category at all: inherit the previous one.
if k not in cur_keys:
add_slots.add((k, v))
continue
if v not in cur_keys[k]:
for cur_v in cur_keys[k]:
# A current value mentioning navigation / road condition / map sets navi_cur.
if "导航" in cur_v or "路况" in cur_v or "地图" in cur_v:
navi_cur = True
# The previous value is contained in a current value: do not keep the previous one.
if v in cur_v:
reserve_pre = False
break
# The current value is contained in the previous one, or is a single character while the
# previous value is longer: the current value is scheduled for removal.
elif cur_v in v or (len(v.decode("utf-8")) > 1 and len(cur_v.decode("utf-8")) == 1):
remove_slots.add((k, cur_v))
else:
all_in_pre = False
else:
reserve_pre = False
if reserve_pre:
# Keep a navigation-type previous category when no current value is navigation-type.
if not navi_cur and ("导航" in v or "路况" in v or "地图" in v):
add_slots.add((k, v))
if all_in_pre:
add_slots.add((k, v))
elif k == "value":
# A previous value is considered only when the current turn has a category but no value.
if k not in cur_keys and "category" in cur_keys:
reserve_pre = False
for cur_v in cur_keys["category"]:
# It is kept for navigation-type categories unless the intent is mute/unmute/navi_mute.
if "导航" in cur_v or "地图" in cur_v or "路况" in cur_v:
if cur_intent not in ["mute", "unmute", "navi_mute"]:
reserve_pre = True
if reserve_pre:
add_slots.add((k, v))
return list(add_slots), list(remove_slots)
def geo_time_merge_slots(pre_state, cur_state):
    """Inherit location, time and category slots from the previous turn.

    :param pre_state: previous turn as [domain, intent, [(k, v), ...]]
    :param cur_state: current turn as [domain, intent, [(k, v), ...]]
    :return: (slots to add, slots to remove); the removal list is always empty
    """
    pre_domain, _pre_intent, pre_slots = pre_state[:]
    cur_domain, _cur_intent, cur_slots = cur_state[:]
    present = {name for name, _ in cur_slots}
    location_keys = {"geo", "home", "company"}

    # Location slots carry over only when the current turn names no location.
    take_location = not (location_keys & present)
    # Time carries over only inside the domains that understand it.
    take_time = cur_domain in {"info_inquire", "weather"} and "time" not in present
    # Category carries over only when the domain did not change.
    take_category = pre_domain == cur_domain and "category" not in present

    carried = []
    for name, value in pre_slots:
        if take_location and name in location_keys:
            carried.append((name, value))
        if take_time and name == "time":
            carried.append((name, value))
        if take_category and name == "category":
            carried.append((name, value))
    return carried, []
def category_merge_slots(pre_slots, cur_keys):
    """Carry the previous turn's category over when the current turn has none.

    :param pre_slots: previous-turn slots as [(key, value), ...]
    :param cur_keys: container of the current turn's slot keys
    :return: (slots to add, slots to remove); the removal list is always empty
    """
    carried = [
        (key, value)
        for key, value in pre_slots
        if key == "category" and key not in cur_keys
    ]
    return carried, []
def intent_mapping(intent, slots):
    """Map a raw intent onto the intent label used for the dialog state.

    Volume intents become their navigation variant when a category slot
    mentions navigation or traffic; "play_music" becomes "positive_play";
    every other intent is returned unchanged.

    :param intent: raw intent name
    :param slots: slots as [(key, value), ...]
    :return: the mapped intent name
    """
    navi_variant = {"volume_up": "navi_up", "volume_down": "navi_down", "mute": "navi_mute"}
    if intent == "play_music":
        return "positive_play"
    if intent not in navi_variant:
        return intent
    targets_navi = any(
        key == "category" and ("导航" in value or "路况" in value)
        for key, value in slots
    )
    return navi_variant[intent] if targets_navi else intent
# ############################## 合理性验证工具 #################################
# NOTE(review): the indentation of this listing was lost; whether the position
# check below belongs inside the `pre_domain == "carcontrol"` branch cannot be
# confirmed from the text alone - verify against the original script.
def valid_state_transform(pre_state, cur_state):
"""
:param pre_state: dialog state of the previous turn: [domain, intent, [(k1, v1), (k2, v2), ...]]
:param cur_state: dialog state of the current turn: [query, domain, intent, [(k1, v1), (k2, v2), ...]]
:return: whether the transition from pre_state to cur_state is plausible
"""
pre_domain, pre_intent, pre_slots = pre_state[:]
cur_query, cur_domain, cur_intent, cur_slots = cur_state[:]
# Slot lists turned into key -> value dicts (a repeated key keeps its last value).
pre_keys = {k: v for k, v in pre_slots}
cur_keys = {k: v for k, v in cur_slots}
if pre_domain == "carcontrol":
degree_slot_keys = {"window_ratio", "air_degree", "volume_degree"}
# After everything has been closed, it cannot be closed again.
if "close" in pre_intent and "close" in cur_intent and not set(pre_keys.keys()) & degree_slot_keys:
# all_position is defined elsewhere in the file.
if {("position", v) for v in all_position} & set(pre_slots):
return False
# The position must not repeat between the two turns.
if "position" in pre_keys and "position" in cur_keys:
pre_pos, cur_pos = pre_keys["position"], cur_keys["position"]
if pre_pos in norm_position and cur_pos in norm_position:
# norm_position values are split on "|" and the first part on ",";
# the two resulting sets are compared for overlap.
pre_position = set(norm_position[pre_pos].split("|")[0].split(","))
cur_position = set(norm_position[cur_pos].split("|")[0].split(","))
if pre_position & cur_position:
return False
# Intent and slots are identical in both turns.
if cur_intent == pre_intent and same_tuple_list(pre_slots, cur_slots):
return False
return True
def valid_query_transform(pre_state, cur_state):
    """
    :param pre_state: dialog state of the previous turn: [domain, intent, [(k1, v1), (k2, v2), ...]]
    :param cur_state: candidate for the current turn: [query, domain, intent]
    :return: whether the chosen query is suitable in this context
    """
    _pre_domain, pre_intent, _pre_slots = pre_state[:]
    cur_query, _cur_domain, cur_intent = cur_state[:]

    def lacks_navi_word():
        # True when the query names neither navigation, map nor traffic.
        return not ("导航" in cur_query or "地图" in cur_query or "路况" in cur_query)

    # Plain volume change followed by a navigation-volume change: the query
    # itself has to say so, otherwise the case is dropped.
    if pre_intent not in navi_intents and cur_intent in navi_intents and lacks_navi_word():
        return False

    # Queries that only redirect the volume to navigation (e.g. "I meant the
    # navigation volume") are dropped after a navigation-volume turn, and
    # otherwise kept only for the intent pairs listed here.
    setting_valid_pair = {("volume_up", "navi_up"), ("volume_down", "navi_down"), ("mute", "navi_mute"),
                          ("volume_max", "navi_up"), ("volume_min", "navi_down"), ("unmute", "navi_up")}
    if cur_query in pure_navi_volume_query:
        already_navi = pre_intent in ["navi_up", "navi_down", "navi_mute"]
        if already_navi or (pre_intent, cur_intent) not in setting_valid_pair:
            return False

    # After set_volume, a navigation-volume intent needs an explicit
    # navigation / traffic / map word; the two ambiguous follow-ups
    # (with or without "navigation" in the first turn) are currently both
    # labelled volume_down.
    if pre_intent == "set_volume" and cur_intent in ["navi_up", "navi_down", "navi_mute"]:
        if lacks_navi_word():
            return False
    return True
def check_include(query, includes):
    """
    :param query: the raw query
    :param includes: list of words to look for
    :return: True if query contains at least one element of includes, else False
    """
    return any(word in query for word in includes)
def same_tuple_list(list_1, list_2):
    """
    :param list_1: [(k1, v1), (k2, v2), ...]
    :param list_2: [(k1, v1), (k2, v2), ...]
    :return: True when both lists are non-empty, equally long and every pair
             of list_1 occurs in list_2; False otherwise
    """
    if len(list_1) != len(list_2) or not list_1 or not list_2:
        return False
    return all((k, v) in list_2 for k, v in list_1)
def print_internal_log(first_turn_info, intent_twice_turn_info, slot_twice_turn_info, transform_matrix):
for k in first_turn_info.keys():
print "\t".join(["first_turn", k])
for k in slot_twice_turn_info.keys():
print "\t".join(["second_turn", k])
for k in intent_twice_turn_info.keys():
print "\t".join(["second_turn", k])
for k, v in transform_matrix.iteritems():
print " -> ".join([k, "\t".join(v)])
# Script entry point: runs build_degree() (defined elsewhere in the file).
if __name__ == "__main__":
# The string below is a disabled sample call of generate_substr kept for reference.
"""
q = "一群有情有义的人"
s1 = generate_substr(q, 2, 20)
print "\t".join(list(s1))
"""
build_degree()
状态转移概率矩阵原理
$$R = (r_{ij}) = \begin{pmatrix} 0.3 & 0.3 & 0.4 \\ 0.3 & 0.2 & 0.5 \\ 0.1 & 0.3 & 0.6 \end{pmatrix}$$
矩阵的行和列都对应状态名称(依次为 a、b、c),元素 $r_{ij}$ 表示从第 i 个状态转移到第 j 个状态的概率,因此每一行的概率之和为 1。
例如 $r_{12}$ 表示从状态 a 到状态 b 的概率,$r_{23}$ 表示从状态 b 到状态 c 的概率。
接下来的问题是:这个表中的概率值是如何确定的?答:由训练集统计得来。
应用马尔科夫模型生成文本
参考: https://blog.csdn.net/wanght89/article/details/78199656
from urllib.request import urlopen
from random import randint
def wordListSum(wordList):
    """Return the total of all counts stored in ``wordList``.

    :param wordList: dict mapping a word to its occurrence count
    :return: sum of the counts (0 for an empty dict)
    """
    return sum(wordList.values())
def retrieveRandomWord(wordList):
    """Pick one word from ``wordList`` at random, weighted by its count.

    :param wordList: dict mapping a candidate word to its occurrence count
    :return: the selected word
    :raises ValueError: if ``wordList`` is empty (``randint(1, 0)``)
    """
    # Draw a position in 1..total and walk the counts until it is used up.
    randIndex = randint(1, sum(wordList.values()))
    for word, value in wordList.items():
        randIndex -= value
        # "<=" rather than "<": a draw equal to the running total lands
        # exactly on 0 and must still select the current word; with "<" the
        # largest draw fell through the loop and returned None.
        if randIndex <= 0:
            return word
def buildWordDict(text):
    """Build a first-order Markov table from ``text``.

    :param text: the source text
    :return: dict of dicts; ``result[w1][w2]`` is how many times ``w2``
             directly follows ``w1``. Punctuation marks count as words.
    """
    # Strip newlines, double quotes and double dashes.
    text = text.replace("\n", " ")
    text = text.replace("\"", "")
    text = text.replace("--", "")
    # Surround punctuation with spaces so each mark becomes its own token
    # and stays in the chain instead of being glued to a word.
    punctuation = [',', '.', ';', ':']
    for symbol in punctuation:
        text = text.replace(symbol, " " + symbol + " ")
    # Drop the empty tokens produced by consecutive spaces.
    words = [word for word in text.split(" ") if word != ""]
    wordDict = {}
    for i in range(1, len(words)):
        followers = wordDict.setdefault(words[i - 1], {})
        followers[words[i]] = followers.get(words[i], 0) + 1
    return wordDict
# Download the sample speech and build its word-transition table.
text = str(urlopen("http://pythonscraping.com/files/inaugurationSpeech.txt").read(), 'utf-8')
wordDict = buildWordDict(text)
# Generate a Markov chain of at most 100 words, starting from "I".
length = 100
chainWords = []
currentWord = "I"
for i in range(0, length):
    chainWords.append(str(currentWord))
    followers = wordDict.get(currentWord)
    # Stop early on a word with no recorded successor (for example the very
    # last word of the text) instead of failing with a KeyError.
    if not followers:
        break
    currentWord = retrieveRandomWord(followers)
# Join with spaces so the generated text stays readable.
chain = " ".join(chainWords)
print(chain)
3、笛卡尔积
4、全对偶选取
测试专家James Bach使用全对偶测试法(All Pairs)将这125个测试用例压缩成25个测试用例,从理论上来覆盖变量组合的测试覆盖率,全对偶测试法有个基本的原则是每个变量的每个取值都要与至少一个用例中的每个其他变量的每个取值配对。
也就是说,只需要保证在整组测试用例中,任意两个变量的每一对取值组合都至少在一个用例中出现过,而不必覆盖所有变量的全部组合。这样就可以得到较少的测试用例,我们就可以得到6个测试用例。
有pairs工具 和 python实现的代码。
说明:
import random
import pandas as pd
import os
import re
class GenContent(object):
    """Build multi-turn dialogue samples from per-intent utterance files."""

    def __init__(self):
        pass

    def get_sta_lab(self, filepath):
        """Load every file of a directory into an intent -> utterances dict.

        :param filepath: directory holding one Excel file per intent
        :return: dict keyed by file name without extension; each value is the
                 first column of that file as a list
        """
        dict_col = {}
        file_col = []
        for name in os.listdir(filepath):
            full_path = os.path.join(filepath, name)
            print(full_path)
            file_col.append(full_path)
        for path in file_col:
            # ``index`` is not a documented read_excel argument, so it is no
            # longer passed; header=None keeps the first row as data.
            df = pd.read_excel(path, header=None)
            intent = os.path.splitext(os.path.basename(path))[0]
            print("intent === ", intent)
            dict_col[intent] = list(df[0])
        return dict_col

    def _comps(self, a, b, all_a=True):
        """Combine two lists pairwise.

        [1, 2] and [3, 4] give [[1, 3], [1, 4], [2, 3], [2, 4]]. Elements of
        ``a`` that are already lists are extended instead of nested.

        :param a: list of items or of partial combinations (lists)
        :param b: list of items to append
        :param all_a: True keeps every combination; False drops those whose
                      new item equals the last item of the partial combination
        :return: list of combinations (lists)
        """
        gen_ = []
        for i in a:
            prefix = list(i) if isinstance(i, list) else [i]
            for j in b:
                if all_a or j != prefix[-1]:
                    gen_.append(prefix + [j])
        return gen_

    def gen_con(self, *a, all_a=True):
        """Combine several groups of items.

        [1, 2], [3, 4], [5] give [[1, 3, 5], [1, 4, 5], [2, 3, 5], [2, 4, 5]].

        :param a: the groups to combine
        :param all_a: forwarded to ``_comps``; False drops combinations that
                      repeat the same item in two consecutive positions
        :return: list of combinations; the first group itself when only one
                 group is given, [] when none is given
        """
        if not a:
            return []
        col = a[0]
        if len(a) != 2:
            print("col==", col)
        for group in a[1:]:
            col = self._comps(col, group, all_a)
        return col

    def list_del_obj(self, a):
        """Remove duplicates from a list of lists, keeping first occurrences.

        :param a: list of (possibly unhashable) items
        :return: new list without duplicates
        """
        col = []
        for i in a:
            if i not in col:
                col.append(i)
        return col

    def gen_context_with_dicor(self, *seq):
        """Generate intent sequences as the Cartesian product of ``seq``.

        Works for any number of turns. Each combination is kept as its own
        list; the previous version flattened them with ``extend`` and only
        accepted exactly four sequences.

        :param seq: one list of candidate intents per dialogue turn
        :return: list of lists, e.g. [["xxx", "yyy", "zzz"], ["xxx", "rrr", "zzz"]]
        """
        from itertools import product
        dic = [list(combo) for combo in product(*seq)]
        gen_list_all = self.list_del_obj(dic)
        print("all_ ", len(gen_list_all))
        return gen_list_all

    def start_gen_context_words_with_random(self,
                                            gen_list_all,
                                            lab_dict,
                                            sample_num):
        """Turn intent combinations into concrete multi-turn dialogues.

        :param gen_list_all: list of intent combinations
        :param lab_dict: dict mapping an intent (file name) to its utterances,
                         e.g. {"navigate": ["navigate", "navigate2"], ...}
        :param sample_num: how many utterances to draw per intent
        :return: list of dialogues, each a list of utterances
        :raises ValueError: from ``random.sample`` when an intent has fewer
                            than ``sample_num`` utterances
        """
        col_list = []
        for g in gen_list_all:
            # Draw the utterances for every intent of this combination.
            lab_col = [random.sample(lab_dict[lab], sample_num) for lab in g]
            if sample_num == 1:
                # NOTE(review): this branch does not apply the geo
                # inheritance used below - confirm that is intended.
                dialog = [picked[0] for picked in lab_col]
                if dialog not in col_list:
                    col_list.append(dialog)
            else:
                # The n-th draw of every intent forms the n-th dialogue.
                for td in zip(*lab_col):
                    # A turn without geo inherits it from the turn before.
                    td_ = self.add_spec_label_inherit(list(td))
                    # Skip dialogues that were already generated.
                    if td_ not in col_list:
                        col_list.append(td_)
        return col_list

    def _del_geo_null(self, obj_speech):
        """Strip empty geo markers ("geo;" / "geo:<TAB>") from each utterance."""
        return [re.sub(r"geo\s*;|geo:\t", "", obj) for obj in obj_speech]

    def add_spec_label_inherit(self, speech, spec="geo"):
        """Let turns without a ``spec`` label inherit it from the previous turn.

        Cross-domain dialogues here only involve navigation, traffic and
        weather, so only the location label is inherited, not a time slot.

        :param speech: list of annotated utterances of one dialogue
        :param spec: label to inherit, "geo" by default
        :return: new list of utterances; ``speech`` itself is not modified
        """
        # Remove empty geo markers first.
        speech_ = self._del_geo_null(speech[:])
        pattern = re.compile(r"(%s:\w+)" % re.escape(spec))
        missing = [i for i, item in enumerate(speech_) if spec not in item]
        for j in missing:
            # The first turn has nothing to inherit from.
            if j == 0:
                continue
            if not speech_[j]:
                print("empty utterance, nothing to inherit")
                print("======", speech)
                continue
            # Make sure the utterance ends with the field separator.
            if speech_[j][-1] != ";":
                speech_[j] = "%s;" % speech_[j]
            # Earlier turns are already updated, so the label chains forward.
            found = pattern.search(speech_[j - 1])
            if found is None:
                print("no %s label in the previous turn" % spec)
                print("======", speech)
                continue
            speech_[j] = speech_[j] + found.group(1)
        return speech_
if __name__ == "__main__":
    # Demo: the last two turns carry no geo label and pick it up from the
    # turn before them.
    demo_dialog = [
        "我要去沙特缘大酒店堵不堵啊;road_condition;search_road_condition;geo:沙特缘大酒店",
        "帮我导航去阳光雅苑;navigation;navigate;geo:阳光雅苑",
        "堵不堵那里;road_condition;search_road_condition;",
        "去那里有多远;navigation;get_how_far;",
    ]
    gc = GenContent()
    sp = gc.add_spec_label_inherit(demo_dialog)
    print(sp)
生成正样本集脚本
import json
from GenContext import GenContent
import time
fmt = '%Y_%m_%d_%H_%M_%S'
# Overall design: the positive sample set is built by first combining the
# intents, then drawing utterances at random from the file of each intent in a
# combination, and finally assembling them into multi-turn dialogues.
# Usage:
# 1 sample_num is the number of utterances drawn for each combination.
# 2 filepath is the directory that holds all intent files.
# 3 Intent names must match the file names.
# 4 Intents to combine go into the lists first, sec, ...; the call to
#   gen_con(...) below decides which lists take part.
# 5 all_a = False selects the conditional combination instead of the
#   unconditional Cartesian product.
# Configuration ------------------------------------------
first = ["search_road_condition", "weather", "navigate", "get_how_far", "get_how_long"]
sec = ["changed_poi", "navigate_2", "get_how_long_2", "search_road_condition_2", "weather_2", "get_how_far_2", "transit_poi_2"]
third = ["changed_poi", "navigate_2", "get_how_long_2", "search_road_condition_2", "weather_2", "get_how_far_2", "transit_poi_2"]
four = ["changed_poi", "navigate_2", "get_how_long_2", "search_road_condition_2", "weather_2", "get_how_far_2", "transit_poi_2"]
timestamp = time.strftime(fmt, time.localtime())
savejson = "save_json_%s.txt" % timestamp
f_name = "save_gen_lab_%s.txt" % timestamp
filepath = r"D:\work\banma_work\任务\NLP测试\跨域上下文query拓展"
sample_num = 2
# Configuration ------------------------------------------
gc = GenContent()
# When no combination has to be filtered out, the Cartesian product variant
# can be used instead:
# gen_list_all = gc.gen_context_with_dicor(first, sec, third, four)
gen_list = gc.gen_con(first, sec, third, four)
print("genlisy==== ", len(gen_list))
gen_list_all = gc.list_del_obj(gen_list)
print("gen_list _ === ", len(gen_list_all))
# Read every file of the directory into a dict whose keys are the file names
# (the dialogue intents) and whose values are the lists of utterances, e.g.
# lab_dict = {"changed_poi": ["changed_poi", "changed_poi2"],
#             "navigate": ["navigate", "navigate2"], ...}
lab_dict = gc.get_sta_lab(filepath)
print("=======================已生成话术库,随机采样=========")
col_list = gc.start_gen_context_words_with_random(gen_list_all, lab_dict, sample_num)
print("=========最终生成%s组对话 ========== " % len(col_list))
col_list = gc.list_del_obj(col_list)
print("=========对其去重后还剩%s组对话=== " % len(col_list))
# Write col_list as JSON.
with open(savejson, mode="w", encoding="utf-8") as f:
    json.dump(col_list, f)
# Write one dialogue per line, turns separated by tabs.
with open(f_name, mode="a+", encoding="utf-8") as f:
    for col_obj in col_list:
        obj = "\t".join(col_obj)
        f.write("%s\n" % obj)
# Reuse the dict loaded above instead of reading every Excel file again.
print(len(lab_dict))

# Allowed transitions: intent -> intents that may follow it.
choices = {
    'changed_poi': ['weather', 'navigate', 'get_how_long', 'get_how_far', 'search_road_condition', 'changed_poi', 'transit_poi'],
    'get_how_long': ['weather', 'search_road_condition', 'changed_poi'],
    'weather': ['weather', 'navigate', 'get_how_long', 'get_how_far', 'search_road_condition', 'transit_poi'],
    'navigate': ['weather', 'search_road_condition', 'changed_poi'],
    'search_road_condition': ['weather', 'navigate', 'get_how_long', 'get_how_far', 'search_road_condition', 'changed_poi', 'transit_poi'],
    'get_how_far': ['weather', 'search_road_condition', 'changed_poi'],
}


def add_node(current):
    """Extend every intent path by one allowed follow-up intent.

    :param current: list of paths, each a list of intent names
    :return: list of new paths, one per allowed successor of each path's last
             intent; paths whose last intent has no entry in ``choices`` are
             dropped
    """
    extended = []
    for path in current:
        for candidate in choices.get(path[-1], {}):
            if candidate:
                extended.append(path + [candidate])
    return extended


r1 = [[intent] for intent in choices]
print(r1)
r2 = add_node([
    ['search_road_condition'],
    ['navigate'],
    ['weather'],
    ['get_how_long'],
    ['get_how_far'],
])
print(r2)
r3 = add_node(r2)
r4 = add_node(r3)
print(r4)
print(len(r4))
2 函数式写法
def get_next(obj, choice):
    """Extend every path in ``obj`` by each successor of its last element.

    :param obj: list of paths (lists of intent names)
    :param choice: dict mapping an intent to the intents that may follow it
    :return: list of extended paths; a path whose last intent is not a key of
             ``choice`` produces nothing
    """
    return [
        path + [successor]
        for path in obj
        for successor in choice.get(path[-1], {})
    ]
def gen_dialog_intent(num, choice, init_intent):
    """Grow the initial intent paths by ``num`` further turns.

    :param num: number of turns to append
    :param choice: dict mapping an intent to the intents that may follow it
    :param init_intent: list of starting paths (lists of intent names)
    :return: list of paths after ``num`` extensions; a copy of
             ``init_intent`` when ``num`` is less than 1
    """
    paths = init_intent[:]
    for _ in range(num):
        # One round: append every allowed successor to every path.
        paths = [
            path + [successor, ]
            for path in paths
            for successor in choice.get(path[-1], {})
        ]
    return paths
# Start every dialogue from one of the five opening intents and add three turns.
init_intent = [
    ['search_road_condition'],
    ['navigate'],
    ['weather'],
    ['get_how_long'],
    ['get_how_far'],
]
r = gen_dialog_intent(3, choices, init_intent)
print(len(r))
print(r)
浙公网安备 33010602011771号