【LeetCode 3087. 3103. 查找热门话题标签】[MySQL/PostgresSQL/T-SQL/Oracle]SQL抽取出推文中的所有hashtag
题目地址
https://leetcode.cn/problems/find-trending-hashtags/description/ 简单版本。每条推文只包含一个标签。
https://leetcode.cn/problems/find-trending-hashtags-ii/ 普通版本。一条推文可能含有多个标签。
思路
如果用Python的话,re.findall就可以做。
这里拿SQL做一下。
代码
MySQL
To extract a hashtag, locate every instance of the "#" symbol, then extend to the right until you reach a space or a newline. The text captured between the "#" and the delimiter is your hashtag.
WITH RECURSIVE numbers AS (
SELECT 1 AS n
UNION ALL
SELECT n + 1
FROM numbers
WHERE n < 100 -- Adjust based on max expected hashtags or characters
),
hashtag_positions AS (
SELECT
tweet_id,
tweet,
n,
LOCATE('#', tweet, n) AS hashtag_start
FROM Tweets
JOIN numbers
WHERE tweet_date BETWEEN '2024-02-01' AND '2024-02-29'
AND tweet LIKE '%#%'
AND LOCATE('#', tweet, n) > 0 -- Only keep rows where '#' is found
),
hashtags_extracted AS (
SELECT
tweet_id,
hashtag_start,
SUBSTRING(
tweet,
hashtag_start,
LEAST(
IF(LOCATE(' ', tweet, hashtag_start) > 0,
LOCATE(' ', tweet, hashtag_start),
LENGTH(tweet) + 1),
IF(LOCATE('\n', tweet, hashtag_start) > 0,
LOCATE('\n', tweet, hashtag_start),
LENGTH(tweet) + 1)
) - hashtag_start
) AS hashtag
FROM hashtag_positions
)
,
temp1 as
(
SELECT distinct * from hashtags_extracted
)
,
t1 AS (
SELECT
hashtag,
COUNT(*) AS count
FROM temp1
WHERE hashtag LIKE '#%'
GROUP BY hashtag
ORDER BY count DESC, hashtag DESC
LIMIT 3
)
SELECT * FROM t1;
PostgresSQL
WITH hashtag_positions AS (
SELECT
tweet_id,
tweet,
n,
POSITION('#' IN SUBSTRING(tweet FROM n)) + n - 1 AS hashtag_start
FROM Tweets
CROSS JOIN GENERATE_SERIES(1, 100) AS n -- Adjust 100 based on max expected hashtags or characters
WHERE tweet_date BETWEEN '2024-02-01' AND '2024-02-29'
AND tweet LIKE '%#%'
AND POSITION('#' IN SUBSTRING(tweet FROM n)) > 0 -- Only keep rows where '#' is found
),
hashtags_extracted AS (
SELECT
tweet_id,
hashtag_start,
SUBSTRING(
tweet
FROM hashtag_start
FOR LEAST(
COALESCE(NULLIF(POSITION(' ' IN SUBSTRING(tweet FROM hashtag_start)), 0), LENGTH(tweet) + 1),
COALESCE(NULLIF(POSITION(E'\n' IN SUBSTRING(tweet FROM hashtag_start)), 0), LENGTH(tweet) + 1)
) - 1
) AS hashtag
FROM hashtag_positions
),
temp1 AS (
SELECT DISTINCT * FROM hashtags_extracted
),
t1 AS (
SELECT
hashtag,
COUNT(*) AS count
FROM temp1
WHERE hashtag LIKE '#%'
GROUP BY hashtag
ORDER BY count DESC, hashtag DESC
LIMIT 3
)
SELECT * FROM t1;
T-SQL
WITH numbers AS (
SELECT n = 1
UNION ALL
SELECT n + 1
FROM numbers
WHERE n < 100 -- Adjust based on max expected hashtags or characters
),
hashtag_positions AS (
SELECT
tweet_id,
tweet,
n,
CHARINDEX('#', tweet, n) AS hashtag_start
FROM Tweets
CROSS JOIN numbers
WHERE tweet_date BETWEEN '2024-02-01' AND '2024-02-29'
AND tweet LIKE '%#%'
AND CHARINDEX('#', tweet, n) > 0 -- Only keep rows where '#' is found
),
hashtags_extracted AS (
SELECT
tweet_id,
hashtag_start,
SUBSTRING(
tweet,
hashtag_start,
CASE
WHEN CHARINDEX(' ', tweet, hashtag_start) > 0
AND CHARINDEX(CHAR(10), tweet, hashtag_start) > 0
THEN IIF(
CHARINDEX(' ', tweet, hashtag_start) < CHARINDEX(CHAR(10), tweet, hashtag_start),
CHARINDEX(' ', tweet, hashtag_start),
CHARINDEX(CHAR(10), tweet, hashtag_start)
)
WHEN CHARINDEX(' ', tweet, hashtag_start) > 0
THEN CHARINDEX(' ', tweet, hashtag_start)
WHEN CHARINDEX(CHAR(10), tweet, hashtag_start) > 0
THEN CHARINDEX(CHAR(10), tweet, hashtag_start)
ELSE LEN(tweet) + 1
END - hashtag_start
) AS hashtag
FROM hashtag_positions
),
temp1 AS (
SELECT DISTINCT * FROM hashtags_extracted
),
t1 AS (
SELECT
hashtag,
COUNT(*) AS count
FROM temp1
WHERE hashtag LIKE '#%'
GROUP BY hashtag
ORDER BY count DESC, hashtag DESC
OFFSET 0 ROWS FETCH NEXT 3 ROWS ONLY
)
SELECT * FROM t1;
Oracle
WITH numbers AS (
SELECT LEVEL AS n
FROM dual
CONNECT BY LEVEL <= 100 -- Adjust based on max expected hashtags or characters
),
hashtag_positions AS (
SELECT
t.tweet_id,
t.tweet,
n.n,
INSTR(t.tweet, '#', n.n) AS hashtag_start
FROM Tweets t
JOIN numbers n
ON INSTR(t.tweet, '#', n.n) > 0
WHERE t.tweet_date BETWEEN TO_DATE('2024-02-01', 'YYYY-MM-DD') AND TO_DATE('2024-02-29', 'YYYY-MM-DD')
AND t.tweet LIKE '%#%'
),
hashtags_extracted AS (
SELECT
tweet_id,
hashtag_start,
SUBSTR(
tweet,
hashtag_start,
LEAST(
CASE WHEN INSTR(tweet, ' ', hashtag_start) > 0 THEN
INSTR(tweet, ' ', hashtag_start)
ELSE
LENGTH(tweet) + 1
END,
CASE WHEN INSTR(tweet, CHR(10), hashtag_start) > 0 THEN
INSTR(tweet, CHR(10), hashtag_start)
ELSE
LENGTH(tweet) + 1
END
) - hashtag_start
) AS hashtag
FROM hashtag_positions
),
temp1 AS (
SELECT DISTINCT * FROM hashtags_extracted
),
t1 AS (
SELECT
hashtag,
COUNT(*) AS count
FROM temp1
WHERE hashtag LIKE '#%'
GROUP BY hashtag
ORDER BY count DESC, hashtag DESC
)
SELECT * FROM t1
WHERE ROWNUM <= 3;

浙公网安备 33010602011771号