Java 热词计算

一、pom引入字典

<dependency>
            <groupId>com.hankcs</groupId>
            <artifactId>hanlp</artifactId>
            <version>portable-1.8.4</version>
        </dependency>

        <!-- 引入 apache commons-compress 依赖 -->
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-compress</artifactId>
            <version>1.26.0</version>
        </dependency>

二、配置文件添加过滤词

hotword:
  stop-words:
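    - 的
    - 了
    - 在
    - 是
    - 我
    - 有
    - 和
    - 就
    - 不
    - 人
    - 都
    - 一
    - 一个
    - 上
    - 也
    - 很
    - 到
    - 说
    - 要
    - 去
    - 你
    - 会
    - 着
    - 没有
    - 看
    - 好
    - 自己
    - 这
    - 那
    - 他
    - 她
    - 它
    - 们
    - 这个
    - 那个
    - 什么
    - 怎么
    - 可以
    - 没
    - 与
    - 还
    - 而
    - 之
    - 大
    - 小
    - 多
    - 少
    - 啊
    - 呀
    - 呢
    - 吧
    - 嘛
    - 啦
    - 哦
    - 嗯
    - 哎
    - 一个人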
    - 一头
    - 一只

三、读取配置

import lombok.Data;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.stereotype.Component;

import java.util.List;

/**
 * Configuration holder for the hot-word feature.
 *
 * <p>Registered as a Spring component and bound to configuration properties under the
 * {@code hotword} prefix. Lombok's {@code @Data} supplies the accessors.
 */
@Data
@Component
@ConfigurationProperties(prefix = "hotword")
public class HotWordProperties {

    /**
     * Stop-word list: words to exclude from hot-word counting.
     * May be {@code null} when no value is bound; no default is assigned here.
     */
    private List<String> stopWords;
}

四、热词处理

import cn.hutool.core.date.DateUtil;
import xxx.config.HotWordProperties;
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.common.Term;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.jdbc.core.JdbcTemplate;
import org.springframework.stereotype.Component;

import java.time.YearMonth;
import java.time.format.DateTimeFormatter;
import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

@Slf4j
@Component
public class HotWordCalculator {

    @Autowired
    private JdbcTemplate jdbcTemplate;

    @Autowired
    private HotWordProperties hotWordProperties;

    private static final DateTimeFormatter MONTH_FORMATTER = DateTimeFormatter.ofPattern("yyyy-MM");

//    private static final Set<String> STOP_WORDS = new HashSet<>(Arrays.asList(
//        "的", "了", "在", "是", "我", "有", "和", "就", "不", "人",
//        "都", "一", "一个", "上", "也", "很", "到", "说", "要", "去",
//        "你", "会", "着", "没有", "看", "好", "自己", "这", "那",
//        "他", "她", "它", "们", "这个", "那个", "什么", "怎么", "可以",
//        "没", "与", "还", "而", "之", "大", "小", "多", "少",
//        "啊", "呀", "呢", "吧", "嘛", "啦", "哦", "嗯", "哎",
//        "一个人", "一头", "一只", "一个"
//    ));

    private Set<String> getStopWords() {
        List<String> stopWords = hotWordProperties.getStopWords();
        return stopWords != null ? new HashSet<>(stopWords) : new HashSet<>();
    }

    public void calculateHotWords(String startDate, String endDate) {
        log.info("开始计算热词,时间范围:{} 至 {}", startDate, endDate);

        String sql = "SELECT A_CONTENT, B_TIME FROM TAB " +
                     "WHERE B_TIME >= TO_DATE(?, 'yyyy-MM-dd') " +
                     "AND B_TIME < TO_DATE(?, 'yyyy-MM-dd')";

        List<Map<String, Object>> events = jdbcTemplate.queryForList(sql, startDate, endDate);
        log.info("共查询到 {} 条事件数据", events.size());

        Map<String, Map<String, Integer>> monthlyHotWords = new HashMap<>();

        for (Map<String, Object> event : events) {
            String content = (String) event.get("A_CONTENT");
            Date registerTime = (Date) event.get("B_TIME");

            if (content == null || registerTime == null) {
                continue;
            }

            String month = DateUtil.format(registerTime, "yyyy-MM");
            List<String> words = extractWords(content);

            monthlyHotWords.computeIfAbsent(month, k -> new HashMap<>());
            Map<String, Integer> wordCount = monthlyHotWords.get(month);

            for (String word : words) {
                wordCount.put(word, wordCount.getOrDefault(word, 0) + 1);
            }
        }

        for (Map.Entry<String, Map<String, Integer>> entry : monthlyHotWords.entrySet()) {
            String month = entry.getKey();
            log.info("++ 处理月份:{}", month);
            Map<String, Integer> wordCount = entry.getValue();

            // 按数量降序排序,相同数量按名称升序排序
            LinkedHashMap<String, Integer> sortedWordCount = wordCount.entrySet()
                    .stream()
                    .sorted(Map.Entry.<String, Integer>comparingByValue().reversed()
                            .thenComparing(Map.Entry.comparingByKey()))
                    .collect(Collectors.toMap(
                            Map.Entry::getKey,
                            Map.Entry::getValue,
                            (e1, e2) -> e1,
                            LinkedHashMap::new
                    ));

            for (Map.Entry<String, Integer> wordEntry : sortedWordCount.entrySet()) {
                String word = wordEntry.getKey();
                Integer num = wordEntry.getValue();

                log.info("{} =》 {}", word, num);
            }
        }

        log.info("热词计算完成");
    }

    private List<String> extractWords(String text) {
        if (text == null || text.trim().isEmpty()) {
            return new ArrayList<>();
        }

        Set<String> stopWords = getStopWords();
        List<Term> terms = HanLP.segment(text);

        return terms.stream()
            .map(term -> term.word.trim())
            .filter(word -> !word.isEmpty())
            .filter(word -> !stopWords.contains(word))
            .filter(word -> word.length() >= 2)
            .filter(word -> !word.matches(".*[0-9a-zA-Z].*") || word.length() > 2)
            .collect(Collectors.toList());
    }
 
}

五、调用

import xx.utils.HotWordCalculator;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.CommandLineRunner;
import org.springframework.stereotype.Component;

import javax.annotation.PostConstruct;

/**
 * Startup job that runs one hot-word calculation over a fixed date range.
 *
 * <p>Logs a marker from the {@code @PostConstruct} hook and another from
 * {@link CommandLineRunner#run(String...)} before starting the calculation.
 */
@Slf4j
@Component
public class TestJob implements CommandLineRunner {

    /** First argument passed to the calculator (start of the date range). */
    private static final String RANGE_START = "2026-01-01";

    /** Second argument passed to the calculator (end of the date range). */
    private static final String RANGE_END = "2026-03-30";

    @Autowired
    private HotWordCalculator hotWordCalculator;

    /** Post-construction hook; only logs a marker line. */
    @PostConstruct
    public void test() {
        log.info("this is => PostConstruct");
    }

    /**
     * Logs a marker line, then runs the hot-word calculation for the fixed range.
     *
     * @param args command-line arguments; not used
     * @throws Exception declared by the {@link CommandLineRunner} contract
     */
    @Override
    public void run(String... args) throws Exception {
        log.info("this is => CommandLineRunner");
        hotWordCalculator.calculateHotWords(RANGE_START, RANGE_END);
    }

}

 

posted @ 2026-03-30 17:56  都是城市惹的祸  阅读(0)  评论(0)    收藏  举报