ProcessMergeRound1Reduce
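
This post contains the reduce-side strategies of the "merge round 1" step: Parser4MergeRound1Reduce re-parses each reducer input line back into a record dict, and ProcessMergeRound1Reduce walks the records (grouped by qid) to join display, search, and click data, normalizing urls before re-emitting. The code is Python 2 and relies on project-internal modules (BaseStrategy, fd, serialize, util, util_parser) that are not shown here.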

import copy
import json


class Parser4MergeRound1Reduce(BaseStrategy):
    def __init__(self):
        BaseStrategy.__init__(self, fd.strategy_type_common, "Parser4MergeRound1Reduce")

    def run(self, info):
        # Reducer-side re-parse: each line is "<sort_key>\t<serialized record>".
        line = info[fd.raw_line]
        items = line.strip().split('\t', 1)
        if len(items) != 2:
            return False
        info.clear()
        info_tmp = serialize.un_serialize(items[1])
        for key, value in info_tmp.iteritems():
            info[key] = value
        info[fd.sort_key] = items[0]
        info[fd.raw_line] = line
        return True
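
Each reducer input line is expected to look like "<sort_key>\t<serialized record>"; the parser rebuilds the record dict from the serialized half and keeps the original line under raw_line, so downstream strategies see a fully populated info dict keyed by the group's sort key.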


class ProcessMergeRound1Reduce(BaseStrategy):
    def __init__(self):
        BaseStrategy.__init__(self, fd.strategy_type_common, "ProcessMergeRound1Reduce")
        # qid of the group currently being accumulated; '-' means no group yet.
        self._curr_qid = '-'
        self._clear_buffer()

    def run(self, info):
        # Records arrive sorted by qid; a change of sort key means the
        # previous group is complete and can be flushed.
        if info[fd.sort_key] != self._curr_qid:
            self.merge_by_qid()
            self._clear_buffer()
        self._curr_qid = info[fd.sort_key]
        if info[fd.input_data_type] == fd.strategy_type_display:
            # TODO performance optimization, by susui at 2015-5-12
            self._buff = copy.deepcopy(info)
        elif info[fd.input_data_type] == fd.strategy_type_click:
            # Click records pass straight through, keyed by query + search id.
            features = info[fd.in_features]
            key = features[fd.newcookie_query] + '\t' + features[fd.search_id]
            info[fd.raw_line] = ''
            self.normalize_urls(info)
            value = key + '\t' + serialize.serialize(info)
            print value
            return True
        else:
            # Search record: remember its qid/query so the buffered display
            # record can be stamped with them at flush time.
            self._curr_search_qid = info[fd.search_id]
            self._curr_query = info[fd.newcookie_query]
            # added by zhangtongtong
            if fd.file_name in info:
                self._se_file_name = info[fd.file_name]
            if fd.line_index in info:
                self._se_line_index = info[fd.line_index]
        return False

    def merge_by_qid(self):
        # Flush the display record buffered for the group that just ended.
        if len(self._buff) < 3:
            return False
        if self._curr_query == '':
            # No search record was seen for this qid; drop the group.
            return False
        features = self._buff[fd.in_features]
        # Prefer the qid taken from the search record, falling back to the sort key.
        if self._curr_search_qid == '-':
            features[fd.display_qid] = self._curr_qid
        else:
            features[fd.display_qid] = self._curr_search_qid
        # added by zhangtongtong
        self._buff[fd.file_name] = self._se_file_name
        self._buff[fd.line_index] = self._se_line_index + 1

        key = features[fd.display_query] + '\t' + features[fd.display_qid]
        self._buff[fd.raw_line] = ''
        features[fd.display_raw_urlinfos] = []
        self.normalize_urls(self._buff)
        value = key + '\t' + serialize.serialize(self._buff)
        print value
        return True
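
Note the two output key formats: clicks are keyed by "newcookie_query\tsearch_id" and merged display records by "display_query\tdisplay_qid"; since display_qid is taken from the search record's search_id when available, both sides of one search presumably line up under the same key for the next merge round.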

    def done(self, info):
        # Flush the final group once all input has been consumed.
        self.merge_by_qid()

    def _clear_buffer(self):
        # Reset per-qid state before accumulating the next group.
        self._buff = {}
        self._curr_search_qid = '-'
        self._curr_query = ''
        # added by zhangtongtong
        self._se_file_name = '-'
        self._se_line_index = 0

    def get_keysign(self, ori_url):
        """Try to recover the real landing url hidden inside a wrapper url."""
        curr_head = ''
        all_heads = ['http://', 'https://']
        for head in all_heads:
            if ori_url.startswith(head):
                curr_head = head
                break
        # m.baidu.com wrappers carry the target url in the 'keysign' parameter.
        special_prefix1 = curr_head + 'm.baidu.com'
        if ori_url.startswith(special_prefix1):
            new_url = util_parser.get_value_from_url_info(ori_url, 'keysign')
            if new_url:
                return new_url
        # m.baidu.com, m5.baidu.com, and relative "/sf?pd=video_page" wrappers:
        # the 'ext' parameter is a json blob whose 'log_loc' field holds the target.
        special_prefix2 = curr_head + 'm5.baidu.com'
        special_prefix_sf = curr_head + '/sf?pd=video_page'
        if ori_url.startswith(special_prefix1) or \
                ori_url.startswith(special_prefix2) or \
                ori_url.startswith(special_prefix_sf):
            ori_url = util.normalize_url(ori_url)
            ext = util_parser.get_value_from_url_info(ori_url, 'ext')
            if ext:
                ext_dict = json.loads(ext)
                new_url = ext_dict.get('log_loc', '')
                if new_url:
                    return new_url
        # sv.baidu.com wrappers expose 'log_loc' as a plain parameter.
        special_prefix3 = curr_head + 'sv.baidu.com'
        if ori_url.startswith(special_prefix3):
            new_url = util_parser.get_value_from_url_info(ori_url, 'log_loc')
            if new_url:
                return new_url
        return None
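
To summarize get_keysign: m.baidu.com wrapper urls carry the target in a 'keysign' parameter; m.baidu.com, m5.baidu.com, and relative "/sf?pd=video_page" urls carry a json 'ext' parameter whose 'log_loc' field holds the target; sv.baidu.com urls expose 'log_loc' as a plain parameter. Anything else returns None and the caller normalizes the url unchanged.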

    def normalize_urls(self, info):
        """Normalize the url and sub-url fields of a display or click record."""
        features = info[fd.in_features]
        try:
            if info[fd.input_data_type] == fd.strategy_type_display:
                urlinfos = features[fd.display_urlinfos]
                for urlinfo in urlinfos:
                    urlinfo[fd.urlinfo_url] = util.normalize_url(urlinfo[fd.urlinfo_url])
                    if fd.urlinfo_child_link in urlinfo:
                        for idx, sub_url in enumerate(urlinfo[fd.urlinfo_child_link]):
                            # Unwrap baidu redirect urls before normalizing.
                            new_sub_url = self.get_keysign(sub_url)
                            if new_sub_url:
                                sub_url = new_sub_url
                            urlinfo[fd.urlinfo_child_link][idx] = util.normalize_url(sub_url)
            else:
                url = features[fd.newcookie_url]
                features[fd.newcookie_url] = util.normalize_url(url)
                mu = features[fd.newcookie_reserve3]
                features[fd.newcookie_reserve3] = util.normalize_url(mu)
                click_url = features[fd.newcookie_click_url]
                features[fd.newcookie_click_url] = util.normalize_url(click_url)

                if fd.newcookie_sub_url in features:
                    sub_url = features[fd.newcookie_sub_url]
                    new_sub_url = self.get_keysign(sub_url)
                    if new_sub_url:
                        sub_url = new_sub_url
                    try:
                        # Click logs are gbk-encoded; convert the unwrapped url
                        # from utf-8 to gbk when possible.
                        sub_url = sub_url.decode('utf8').encode('gbk')
                    except Exception:
                        pass
                    features[fd.newcookie_sub_url] = util.normalize_url(sub_url)
            util.add_counter('strategy_error', 'decode success QU', 1)
        except Exception as e:
            util.warn_log('ProcessMergeRound1Reduce: %s' % str(e))
            util.add_counter('strategy_error', 'decode failed QU', 1)
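
For context, here is a minimal sketch of how these strategies could be driven as a streaming reducer. The driver loop itself is an assumption (the BaseStrategy framework is not shown in this post); only the run/done calls and the fd field names follow the code above.

import sys

def reduce_main():
    parser = Parser4MergeRound1Reduce()
    merger = ProcessMergeRound1Reduce()
    info = {}
    for line in sys.stdin:
        # Hand each raw reducer line to the parser, then the merger.
        info = {fd.raw_line: line}
        if not parser.run(info):
            continue  # malformed line (no tab separator)
        merger.run(info)
    # Flush the last qid group once stdin is exhausted.
    merger.done(info)

if __name__ == '__main__':
    reduce_main()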
