ProcessMergeRound1Reduce
import copy
import json

# fd (field-name constants), serialize, util, util_parser and BaseStrategy are
# project-local modules imported from elsewhere in this repo; their exact
# import paths are not shown in this file.


class Parser4MergeRound1Reduce(BaseStrategy):
    """Split the sort key off each reduce input line and inflate the
    serialized payload back into the info dict."""

    def __init__(self):
        BaseStrategy.__init__(self, fd.strategy_type_common, "Parser4MergeRound1Reduce")

    def run(self, info):
        line = info[fd.raw_line]
        items = line.strip().split('\t', 1)
        if len(items) == 2:
            info.clear()
            info_tmp = serialize.un_serialize(items[1])
            for key, value in info_tmp.iteritems():
                info[key] = value
            info[fd.sort_key] = items[0]
            info[fd.raw_line] = line
        else:
            return False
        return True


class ProcessMergeRound1Reduce(BaseStrategy):
    """Merge display, click and search records that share a qid: display
    records are buffered, click records are emitted directly, and search
    records contribute the qid/query used when the buffer is flushed."""

    def __init__(self):
        BaseStrategy.__init__(self, fd.strategy_type_common, "ProcessMergeRound1Reduce")
        self._curr_qid = '-'
        self._clear_buffer()

    def run(self, info):
        # A new sort key means the previous qid group is complete: flush it.
        if info[fd.sort_key] != self._curr_qid:
            self.merge_by_qid()
            self._clear_buffer()
            self._curr_qid = info[fd.sort_key]
        if info[fd.input_data_type] == fd.strategy_type_display:
            # TODO: performance optimization, by susui, 2015-05-12
            self._buff = copy.deepcopy(info)
            #self._buff = info
        elif info[fd.input_data_type] == fd.strategy_type_click:
            # Click records need no merging: normalize and emit immediately.
            features = info[fd.in_features]
            #key = features[fd.newcookie_query] + '\t' + features[fd.newcookie_qid]
            key = features[fd.newcookie_query] + '\t' + features[fd.search_id]
            info[fd.raw_line] = ''
            self.normalize_urls(info)
            jsonObj = serialize.serialize(info)
            value = key + '\t' + jsonObj
            print value
            return True
        else:
            # Search record: remember its qid/query (and source position)
            # for the display record buffered in this group.
            self._curr_search_qid = info[fd.search_id]
            self._curr_query = info[fd.newcookie_query]
            # added by zhangtongtong
            if fd.file_name in info:
                self._se_file_name = info[fd.file_name]
            if fd.line_index in info:
                self._se_line_index = info[fd.line_index]
            return False
        return False

    def merge_by_qid(self):
        """Emit the buffered display record joined with the search-side qid."""
        if len(self._buff) < 3:
            return False
        if self._curr_query == '':
            return False
        features = self._buff[fd.in_features]
        if self._curr_search_qid == '-':
            features[fd.display_qid] = self._curr_qid
        else:
            features[fd.display_qid] = self._curr_search_qid
        # added by zhangtongtong
        self._buff[fd.file_name] = self._se_file_name
        self._buff[fd.line_index] = self._se_line_index + 1
        key = features[fd.display_query] + '\t' + features[fd.display_qid]
        self._buff[fd.raw_line] = ''
        features[fd.display_raw_urlinfos] = []
        self.normalize_urls(self._buff)
        jsonObj = serialize.serialize(self._buff)
        value = key + '\t' + jsonObj
        # key always contains a tab, so value can never be empty here
        print value
        return True

    def done(self, info):
        # Flush the final buffered qid group at end of input.
        self.merge_by_qid()

    def _clear_buffer(self):
        self._buff = {}
        self._curr_search_qid = '-'
        self._curr_query = ''
        # added by zhangtongtong
        self._se_file_name = '-'
        self._se_line_index = 0

    def get_keysign(self, ori_url):
        """Extract a stable location key from known Baidu wrapper URLs,
        or return None if the URL is not one of the special forms."""
        curr_head = ''
        all_heads = ['http://', 'https://']
        for head in all_heads:
            if ori_url.startswith(head):
                curr_head = head
                break
        special_prefix1 = curr_head + 'm.baidu.com'
        if ori_url.startswith(special_prefix1):
            new_url = util_parser.get_value_from_url_info(ori_url, 'keysign')
            if new_url:
                return new_url
        special_prefix2 = curr_head + 'm5.baidu.com'
        special_prefix_sf = curr_head + '/sf?pd=video_page'
        if ori_url.startswith(special_prefix1) or \
                ori_url.startswith(special_prefix2) or \
                ori_url.startswith(special_prefix_sf):
            ori_url = util.normalize_url(ori_url)
            ext = util_parser.get_value_from_url_info(ori_url, 'ext')
            if ext:
                ext_dict = json.loads(ext)
                new_url = ext_dict.get('log_loc', '')
                if new_url:
                    return new_url
        special_prefix3 = curr_head + 'sv.baidu.com'
        if ori_url.startswith(special_prefix3):
            new_url = util_parser.get_value_from_url_info(ori_url, 'log_loc')
            if new_url:
                return new_url
        return None

    def normalize_urls(self, info):
        """Normalize the main url and sub urls of a record in place."""
        features = info[fd.in_features]
        try:
            if info[fd.input_data_type] == fd.strategy_type_display:
                urlinfos = features[fd.display_urlinfos]
                for urlinfo in urlinfos:
                    urlinfo[fd.urlinfo_url] = util.normalize_url(urlinfo[fd.urlinfo_url])
                    if fd.urlinfo_child_link in urlinfo:
                        for idx, sub_url in enumerate(urlinfo[fd.urlinfo_child_link]):
                            new_sub_url = self.get_keysign(sub_url)
                            if new_sub_url:
                                sub_url = new_sub_url
                            urlinfo[fd.urlinfo_child_link][idx] = util.normalize_url(sub_url)
            else:
                url = features[fd.newcookie_url]
                features[fd.newcookie_url] = util.normalize_url(url)
                mu = features[fd.newcookie_reserve3]
                features[fd.newcookie_reserve3] = util.normalize_url(mu)
                click_url = features[fd.newcookie_click_url]
                features[fd.newcookie_click_url] = util.normalize_url(click_url)
                if fd.newcookie_sub_url in features:
                    sub_url = features[fd.newcookie_sub_url]
                    new_sub_url = self.get_keysign(sub_url)
                    if new_sub_url:
                        sub_url = new_sub_url
                    try:
                        sub_url = sub_url.decode('utf8').encode('gbk')
                    except Exception:
                        pass  # keep the original bytes if transcoding fails
                    features[fd.newcookie_sub_url] = util.normalize_url(sub_url)
                util.add_counter('strategy_error', 'decode success QU', 1)
        except Exception as e:
            util.warn_log('ProcessMergeRound1Reduce: %s' % str(e))
            util.add_counter('strategy_error', 'decode failed QU', 1)
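For context, the snippet below is a minimal, illustrative sketch of the streaming-reducer loop that would drive these two strategies: input lines arrive on stdin already sorted by the merge key, Parser4MergeRound1Reduce inflates each line into an info dict, and ProcessMergeRound1Reduce buffers or emits per qid. The real driver and strategy registration live elsewhere in this repo; reduce_main and everything in it, apart from the two classes above and fd.raw_line, are hypothetical names used only for illustration.

import sys

def reduce_main():
    # Hypothetical driver: the production runner is not part of this file.
    parser = Parser4MergeRound1Reduce()
    processor = ProcessMergeRound1Reduce()
    info = {}
    for line in sys.stdin:
        info = {fd.raw_line: line}
        if not parser.run(info):
            # Malformed line (no tab-separated payload): skip it.
            continue
        processor.run(info)  # buffers displays, prints click/merged records
    processor.done(info)     # flush the last buffered qid group

if __name__ == '__main__':
    reduce_main()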