Sphinx-for-Chinese的分词细粒度问题(二)

在Sphinx-for-Chinese的分词细粒度问题中说过，为了解决分词的粒度问题，我们对Sphinx-for-Chinese的代码进行了一些修改，而针对精确匹配我们也写了一些额外的代码，虽然这一部分的代码并不是很好看，但毕竟解决了问题，所以也想对这一部分进行说明，因为相信其他人也会遇到类似的问题，这里可以提供一个参考的解决方案。

所谓精确匹配，也就是搜索的词语与被搜索的字段完全相同。例如假设有三个标题：中大，中大酒店，中大假日酒店，则搜索中大时，只有标题“中大”与之完全匹配。一般情况下，我们都希望精确匹配的内容排在前面，此时还需要设置排序方法为SPH_RANK_SPH04。

依然以sphinx-for-chinese-2.2.1-dev-r4311为例，在sphinxsearch.cpp中6282行附近，找到RankerState_ProximityBM25Exact_fn，这里就是sph04的实现。看到数据成员m_uExactHit，知道这个与精确匹配有关，在这段代码里看到HITMAN::IsEnd，于是猜测在某个地方有SetEnd,在sphinx.cpp中27144行附近找到CSphSource_Document::BuildRegularHits方法，在这里找到了，
CSphWordHit * pHit = const_cast < CSphWordHit * > ( m_tHits.Last() );
HITMAN::SetEndMarker ( &pHit->m_iWordPos );
于是我们想，在进行细粒度分词时，中大将被分成，中大、中、大三个词。只要有某种办法，将中大这个词也使用SetEndMarker就可以达到所要的目的，于是增加了一些代码。

这之后，搜索中大时，中大这个标题确实排在了前面，可是问题又出现了，在搜索中大酒店时，中大酒店这个标题并没有排在前面，中大酒店与中大假日酒店的权重是相同的。分析了原因，搜索中大酒店时，将被分成中大+酒店，而中大假日酒店中，正好也包含中大和酒店，并且酒店也是排在末尾，于是这两个的权重是一样的。于是我们只好再看看m_uExactHit的计算，发现IsEnd并不是唯一的条件，于是相信未细分以前，索引中大酒店时，分词的词是中大、酒店，而细分后变成了中大、中、大、大酒店、酒店、酒、店，于是我们猜测，如果将分词按照原先的方法分一次，之后再一起返回细粒度的分词，可能可以达到目的。这样的结果就是分词返回的是中大、酒店、中、大、大酒店、酒、店。于是按照这个想法，又增加了一些代码。果然这次搜索中大酒店时，中大酒店排在了前面，并且权重比中大假日酒店高。

Sphinx-for-Chinese的分词细粒度问题解决代码

感觉上，这段代码不贴上来，仿佛欠别人钱似的。趁现在还有些精力，以后很长一段时间都不会接触Sphinx了，赶紧把这件事给做了。
具体为什么这样改，可以看前面的文章。以下修改是基于sphinx-for-chinese-2.2.1-dev-r4311版本，只需要修改sphinx.cpp即可。

在2296行后面添加如下代码：

// One buffered token, as stored by ISphWords::AddWord and later replayed by
// the tokenizer's ProcessParsedWord.
struct CSphWord
{
    BYTE m_sAccum[3 * SPH_MAX_WORD_LEN + 3]; // raw bytes copied in by AddWord (callers pass word bytes plus two trailing bytes)
    int length;                              // number of bytes of m_sAccum that AddWord copied
    const BYTE *m_pTokenStart;               // token start pointer recorded by AddWord
    const BYTE *m_pTokenEnd;                 // token end pointer recorded by AddWord
};
/// Simple append-only store of CSphWord entries backed by a CSphVector.
/// The tokenizer fills it with AddWord and walks it through First()/Last().
class ISphWords
{
public:
    /// Number of words currently stored.
    int Length () const
    {
        return m_dData.GetLength();
    }

    /// Pointer to the first stored word, as returned by CSphVector::Begin().
    const CSphWord * First () const
    {
        return m_dData.Begin();
    }

    /// Pointer to the last stored word.
    /// NOTE(review): forwards to CSphVector::Last(); presumably must not be
    /// called on an empty container - confirm against CSphVector.
    const CSphWord * Last () const
    {
        return &m_dData.Last();
    }

    /// Drops all stored words.
    void Clean()
    {
        m_dData.Reset();
    }

    /// Appends one word.
    /// @param word    source bytes to copy into the new entry
    /// @param length  number of bytes to copy; clamped to the capacity of
    ///                CSphWord::m_sAccum so an oversized request can never
    ///                write past the fixed-size buffer
    /// @param start   token start pointer to record
    /// @param end     token end pointer to record
    void AddWord ( BYTE * word, int length, const BYTE *start, const BYTE *end)
    {
        CSphWord & tWord = m_dData.Add();

        // The destination is a fixed-size array: never copy more than it holds.
        const int iCapacity = static_cast<int> ( sizeof ( tWord.m_sAccum ) );
        if ( length < 0 )
            length = 0;
        else if ( length > iCapacity )
            length = iCapacity;

        if ( length > 0 )
            memcpy ( tWord.m_sAccum, word, length );

        tWord.length = length;
        tWord.m_pTokenStart = start;
        tWord.m_pTokenEnd = end;
    }

public:
    CSphVector<CSphWord> m_dData; ///< underlying storage
};

在2296行,virtual int GetMaxCodepointLength () const { return m_tLC.GetMaxCodepointLength(); }后面添加如下方法成员：

cvirtual BYTE *          ProcessParsedWord();

在2303行，Darts::DoubleArray::result_pair_type m_pResultPair[256];后面添加如下数据成员：

/*****add by luodongshan for indexer*****/
// State used by the fine-grained (index-time) segmentation added by this patch.
int totalParsedWordsNum; // total number of words that need to be processed
int processedParsedWordsNum; // number of words already processed
int isIndexer; // whether fine-grained segmentation is enabled
bool needMoreParser; // finer-grained segmentation is still needed
const char * m_pTempCur; // cursor that the fine-grained loop advances from m_BestWord
char  m_BestWord[3 * SPH_MAX_WORD_LEN + 3]; // copy of the bytes of the best match
int m_iBestWordLength; // byte length of the best match
ISphWords m_Words; // words collected by fine-grained segmentation
CSphWord *current; // next entry of m_Words that ProcessParsedWord hands out
bool isParserEnd; // set once regular tokenization has reached the end

在6448行，m_bHasBlend = false;后面添加如下初始化代码：

char *penv = getenv("IS_INDEX");
if (penv != NULL) {
    isIndexer = 1;
} else {
    isIndexer = 0;
}     
needMoreParser = false;
current = NULL;
isParserEnd = false;

在6743后面添加新增方法成员ProcessParsedWord的实现：

/// Hands out the buffered fine-grained words one per call.
/// Copies the next pending word into m_sAccum, restores its token bounds and
/// returns m_sAccum. When nothing is pending it resets the replay state
/// (isParserEnd, m_Words, current) and returns NULL.
template < bool IS_QUERY >
BYTE * CSphTokenizer_UTF8Chinese<IS_QUERY>::ProcessParsedWord() {
    // Nothing queued, or every queued word has already been returned.
    if (current == NULL || current > m_Words.Last()) {
        isParserEnd = false;
        m_Words.Clean();
        current = NULL;
        return NULL;
    }

    // Take the pending entry and step the cursor past it.
    const CSphWord * pNext = current++;
    memcpy(m_sAccum, pNext->m_sAccum, pNext->length);
    m_pTokenStart = pNext->m_pTokenStart;
    m_pTokenEnd = pNext->m_pTokenEnd;
    return m_sAccum;
}

在6785行， bool bGotSoft = false; // hey Beavis he said soft huh huhhuh后面增加如下代码：

// Once regular tokenization is over, every further call just replays the
// words collected by fine-grained segmentation.
if (isIndexer && isParserEnd) { // MMSEG segmentation has finished; process the words produced by fine-grained segmentation
    return ProcessParsedWord();
}

在6791行， int iNum;后面增加如下代码：

/***add by dengsl 2014/06/24****/
// Walks over the bytes of the previously returned best match and stores every
// dictionary prefix word found at each visited position into m_Words.
if(isIndexer && needMoreParser) { // run fine-grained segmentation over the best match
    while (m_pTempCur < m_BestWord + m_iBestWordLength) {
        if(processedParsedWordsNum == totalParsedWordsNum) { // all prefix words at this position are done; jump to the next position
            // Advance by the shortest prefix word found at the current position.
            // NOTE(review): when totalParsedWordsNum is 0 this reads m_pResultPair[0]
            // from an earlier search - confirm that case cannot occur.
            size_t minWordLength = m_pResultPair[0].length;
            for(int i = 1; i < totalParsedWordsNum; i++) {
                if(m_pResultPair[i].length < minWordLength) {
                    minWordLength = m_pResultPair[i].length;
                }     
            }     
            m_pTempCur += minWordLength;
            // Search again at the matching offset inside the real input buffer.
            m_pText=(Darts::DoubleArray::key_type *)(m_pCur + (m_pTempCur - m_BestWord));
            iNum = m_tDa.commonPrefixSearch(m_pText, m_pResultPair, 256, m_pBufferMax-(m_pCur+(m_pTempCur-m_BestWord)));
            totalParsedWordsNum = iNum;
            processedParsedWordsNum = 0;
        } else {
            iWordLength = m_pResultPair[processedParsedWordsNum].length;
            processedParsedWordsNum++;
            // Skip the hit that is the best match itself (same start, same length).
            if (m_pTempCur == m_BestWord && iWordLength == m_iBestWordLength) {
                continue;
            }     
            memcpy(m_sAccum, m_pText, iWordLength);
            m_sAccum[iWordLength] = '\0';
            // The byte after the terminator carries the end-marker flag (0 or 1).
            if( 3 * SPH_MAX_WORD_LEN + 3 >= iWordLength + 2) {
                m_sAccum[iWordLength + 1] = '\0';
                if(m_pTokenEnd == m_pBufferMax) { // at the end of the buffer: store the end-marker flag
                    m_sAccum[iWordLength + 1] = 1;
                }     
            }     
            // NOTE(review): iWordLength + 2 bytes are queued even when the size
            // check above failed - confirm words can never be that long.
            m_Words.AddWord(m_sAccum, iWordLength + 2, m_pCur + (m_pTempCur - m_BestWord), m_pCur + (m_pTempCur - m_BestWord) + iWordLength);
        }     
    }     
    // The whole best match has been consumed; resume after it and point the
    // replay cursor at the first queued word.
    m_pCur += m_iBestWordLength;
    needMoreParser = false;
    iWordLength = 0;
    current = const_cast< CSphWord * > ( m_Words.First() );
}     
/***add end by dengsl 2014/06/24****/

在6832行，iNum = m_tDa.commonPrefixSearch(m_pText, m_pResultPair, 256, m_pBufferMax-m_pCur);后面增加如下代码：

/***add by dengsl 2014/06/24****/
// When several dictionary words share this prefix, return the best match now
// and remember it so the next call can split it further.
if(isIndexer && iNum > 1) {
    m_iBestWordLength=getBestWordLength(m_pText, m_pBufferMax-m_pCur);
    // NOTE(review): the copy happens before the size check further down -
    // confirm getBestWordLength can never exceed the m_sAccum capacity.
    memcpy(m_sAccum, m_pText, m_iBestWordLength);
    m_sAccum[m_iBestWordLength]='\0';
    m_pTokenStart = m_pCur;
    m_pTokenEnd = m_pCur + m_iBestWordLength;

    // Arm the fine-grained pass: keep the hit count and a private copy of the
    // best match, with the cursor at its first byte.
    totalParsedWordsNum = iNum;
    needMoreParser = true;
    processedParsedWordsNum = 0;
    memcpy(m_BestWord, m_pText, m_iBestWordLength);
    m_BestWord[m_iBestWordLength]='\0';
    m_pTempCur = m_BestWord;
    // The byte after the terminator carries the end-marker flag (0 or 1).
    if( 3 * SPH_MAX_WORD_LEN + 3 >= m_iBestWordLength + 2) {
        m_sAccum[m_iBestWordLength + 1] = '\0';
        if(m_pTokenEnd == m_pBufferMax) { // at the end of the buffer: store the end-marker flag
            m_sAccum[m_iBestWordLength + 1] = 1;
        }     
    }     
    return m_sAccum;
}     
/***add end by dengsl 2014/06/24****/

在6903行，将

return NULL;

修改为

/* dengsl: regular tokenization is finished; flag it and start returning the buffered fine-grained words instead of NULL */
isParserEnd = true;
return ProcessParsedWord();

在6914行，将

if_const ( IS_BLEND && !BlendAdjust ( pCur ) )
   return NULL;

修改成：

/* dengsl: where the original returned NULL, flag the end of regular tokenization and start returning the buffered fine-grained words */
if_const ( IS_BLEND && !BlendAdjust ( pCur ) ) {
    isParserEnd = true;
    return ProcessParsedWord();
}

在27210行，m_tHits.AddHit ( uDocid, iWord, m_tState.m_iHitPos );后面增加如下代码：

///add by luodongshan 20140626
// Tag the hit that was just added with the end marker when the byte after the
// word's terminator holds the value 1, IS_INDEX is set in the environment and
// end markers are not being skipped.
if ( sWord )
{
    const int iWordBytes = strlen ( (char*)sWord );
    // Only look at the byte after the terminator when it fits the word buffer.
    const bool bFlagInRange = ( iWordBytes + 2 <= 3 * SPH_MAX_WORD_LEN + 3 );
    if ( bFlagInRange
        && sWord[iWordBytes + 1] == 1
        && getenv("IS_INDEX") != NULL
        && !bSkipEndMarker )
    {
        CSphWordHit * pLastHit = const_cast < CSphWordHit * > ( m_tHits.Last() );
        HITMAN::SetEndMarker ( &pLastHit->m_iWordPos );
    }
}
///add by luodongshan 20140626 end

经过上面的修改，重新编译源码，之后设置环境变量IS_INDEX,即运行export IS_INDEX=1,就可以支持细粒度的划分。

一个需要注意的地方是,对于searchd,也变成细粒度分词了，这并不是我们想要的，所以对于searchd，需要使用未修改代码的searchd.因为我们想建索引时细粒度，搜索时粗粒度。

之所以要这样，是因为如果不这样处理，很多结果会搜不出来。如有文章内容分别为中大酒店，中大假日酒店。如果搜索时也是细粒度，则有中大，酒店，中，大，大酒店，酒，店等查询词，而大酒店只在中大酒店中存在，所以只会搜出中大酒店，这并不是我们想要的。

文章来自：

http://program.dengshilong.org/2014/10/19/Sphinx-for-Chinese%E7%9A%84%E5%88%86%E8%AF%8D%E7%BB%86%E7%B2%92%E5%BA%A6%E9%97%AE%E9%A2%98%E8%A7%A3%E5%86%B3%E4%BB%A3%E7%A0%81/

posted @ 2018-03-26 15:19 丰study 阅读(207) 评论(0) 收藏举报

刷新页面返回顶部

丰study

Sphinx-for-Chinese的分词细粒度问题(二)

Sphinx-for-Chinese的分词细粒度问题解决代码

公告