class Parser4MergeRound1Reduce(BaseStrategy):
    """Parse a reduce-side input line of the form '<sort_key>\t<serialized info>'
    back into the info dict, restoring the sort key and the raw line."""

    def __init__(self):
        BaseStrategy.__init__(self, fd.strategy_type_common, "Parser4MergeRound1Reduce")

    def run(self, info):
        line = info[fd.raw_line]
        items = line.strip().split('\t', 1)
        if len(items) != 2:
            # malformed line: no tab separator
            return False
        info.clear()
        info_tmp = serialize.un_serialize(items[1])
        for key, value in info_tmp.iteritems():
            info[key] = value
        info[fd.sort_key] = items[0]
        info[fd.raw_line] = line
        return True
class ProcessMergeRound1Reduce(BaseStrategy):
    """Merge the display, click and search records that share one qid.
    The display record is buffered until the whole qid group has been seen,
    then flushed by merge_by_qid(); click records are emitted immediately."""

    def __init__(self):
        BaseStrategy.__init__(self, fd.strategy_type_common, "ProcessMergeRound1Reduce")
        self._curr_qid = '-'
        self._clear_buffer()

    def run(self, info):
        if info[fd.sort_key] != self._curr_qid:
            # new qid group: flush the buffered display record first
            self.merge_by_qid()
            self._clear_buffer()
            self._curr_qid = info[fd.sort_key]
        if info[fd.input_data_type] == fd.strategy_type_display:
            # buffer the display record; deepcopy so later mutations of
            # info do not leak into the buffer
            # TODO: performance optimization, by susui at 2015-5-12
            self._buff = copy.deepcopy(info)
            #self._buff = info
        elif info[fd.input_data_type] == fd.strategy_type_click:
            # click records are emitted immediately, keyed by query + search id
            features = info[fd.in_features]
            #key = features[fd.newcookie_query] + '\t' + features[fd.newcookie_qid]
            key = features[fd.newcookie_query] + '\t' + features[fd.search_id]
            info[fd.raw_line] = ''
            self.normalize_urls(info)
            jsonObj = serialize.serialize(info)
            value = key + '\t' + jsonObj
            print value
            return True
        else:
            # search records only update the per-qid state
            self._curr_search_qid = info[fd.search_id]
            self._curr_query = info[fd.newcookie_query]
            # added by zhangtongtong
            if fd.file_name in info:
                self._se_file_name = info[fd.file_name]
            if fd.line_index in info:
                self._se_line_index = info[fd.line_index]
        return False
    def merge_by_qid(self):
        """Emit the buffered display record, stamped with the search qid,
        as '<query>\t<qid>\t<serialized info>'."""
        if len(self._buff) < 3:
            # nothing buffered for this qid group
            return False
        if self._curr_query == '':
            return False
        features = self._buff[fd.in_features]
        if self._curr_search_qid == '-':
            features[fd.display_qid] = self._curr_qid
        else:
            features[fd.display_qid] = self._curr_search_qid
        # added by zhangtongtong
        self._buff[fd.file_name] = self._se_file_name
        self._buff[fd.line_index] = self._se_line_index + 1
        key = features[fd.display_query] + '\t' + features[fd.display_qid]
        self._buff[fd.raw_line] = ''
        features[fd.display_raw_urlinfos] = []
        self.normalize_urls(self._buff)
        jsonObj = serialize.serialize(self._buff)
        value = key + '\t' + jsonObj
        if value == '':
            return False
        print value
        return True
    def done(self, info):
        # flush the last buffered qid group at end of input
        self.merge_by_qid()

    def _clear_buffer(self):
        self._buff = {}
        self._curr_search_qid = '-'
        self._curr_query = ''
        # added by zhangtongtong
        self._se_file_name = '-'
        self._se_line_index = 0
    def get_keysign(self, ori_url):
        """Extract the landing URL hidden in Baidu wrapper URLs: the 'keysign'
        param on m.baidu.com, the 'log_loc' field of the 'ext' json param on
        m5.baidu.com and video sf pages, and the 'log_loc' param on
        sv.baidu.com. Returns None if nothing can be extracted."""
        curr_head = ''
        all_heads = ['http://', 'https://']
        for head in all_heads:
            if ori_url.startswith(head):
                curr_head = head
                break
        # with no scheme, curr_head stays '' so scheme-less URLs still match
        special_prefix1 = curr_head + 'm.baidu.com'
        if ori_url.startswith(special_prefix1):
            new_url = util_parser.get_value_from_url_info(ori_url, 'keysign')
            if new_url:
                return new_url
        special_prefix2 = curr_head + 'm5.baidu.com'
        special_prefix_sf = curr_head + '/sf?pd=video_page'
        if ori_url.startswith(special_prefix1) or \
                ori_url.startswith(special_prefix2) or \
                ori_url.startswith(special_prefix_sf):
            ori_url = util.normalize_url(ori_url)
            ext = util_parser.get_value_from_url_info(ori_url, 'ext')
            if ext:
                ext_dict = json.loads(ext)
                new_url = ext_dict.get('log_loc', '')
                if new_url:
                    return new_url
        special_prefix3 = curr_head + 'sv.baidu.com'
        if ori_url.startswith(special_prefix3):
            new_url = util_parser.get_value_from_url_info(ori_url, 'log_loc')
            if new_url:
                return new_url
        return None
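
    # Illustrative behavior (the URL shapes are assumptions, not real log
    # data), assuming util_parser.get_value_from_url_info returns the value
    # of the named query parameter:
    #   get_keysign('http://m.baidu.com/s?keysign=abc') -> 'abc'
    #   get_keysign('http://example.com/page')          -> None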
    def normalize_urls(self, info):
        """Normalize the main url and the sub urls in place."""
        features = info[fd.in_features]
        try:
            if info[fd.input_data_type] == fd.strategy_type_display:
                urlinfos = features[fd.display_urlinfos]
                for urlinfo in urlinfos:
                    urlinfo[fd.urlinfo_url] = util.normalize_url(urlinfo[fd.urlinfo_url])
                    if fd.urlinfo_child_link in urlinfo:
                        for idx, sub_url in enumerate(urlinfo[fd.urlinfo_child_link]):
                            new_sub_url = self.get_keysign(sub_url)
                            if new_sub_url:
                                sub_url = new_sub_url
                            urlinfo[fd.urlinfo_child_link][idx] = util.normalize_url(sub_url)
            else:
                url = features[fd.newcookie_url]
                features[fd.newcookie_url] = util.normalize_url(url)
                mu = features[fd.newcookie_reserve3]
                features[fd.newcookie_reserve3] = util.normalize_url(mu)
                click_url = features[fd.newcookie_click_url]
                features[fd.newcookie_click_url] = util.normalize_url(click_url)
                if fd.newcookie_sub_url in features:
                    sub_url = features[fd.newcookie_sub_url]
                    new_sub_url = self.get_keysign(sub_url)
                    if new_sub_url:
                        sub_url = new_sub_url
                    try:
                        # sub urls may be utf8 while the rest of the record is gbk
                        sub_url = sub_url.decode('utf8').encode('gbk')
                    except Exception:
                        # keep the original bytes if transcoding fails
                        pass
                    features[fd.newcookie_sub_url] = util.normalize_url(sub_url)
                    util.add_counter('strategy_error', 'decode success QU', 1)
        except Exception as e:
            util.warn_log('ProcessMergeRound1Reduce: %s' % str(e))
            util.add_counter('strategy_error', 'decode failed QU', 1)
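
# ----------------------------------------------------------------------
# Minimal usage sketch, not part of the production pipeline: how these
# two strategies could be chained in a streaming reduce loop. The driver
# name (reduce_main) and the stdin wiring are assumptions for illustration;
# the real framework that instantiates strategies lives elsewhere in this
# repo.
def reduce_main():
    import sys
    parser = Parser4MergeRound1Reduce()
    merger = ProcessMergeRound1Reduce()
    info = {}
    for line in sys.stdin:
        info[fd.raw_line] = line
        if not parser.run(info):
            # malformed line: not '<sort_key>\t<serialized info>'
            continue
        # buffers display records, prints click/merged records to stdout
        merger.run(info)
    # flush the display record buffered for the last qid group
    merger.done(info)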