搜索引擎模型

1 爬虫模块
2 索引模块
采用二元分词存储
3 搜索模块
3.1 ASP.NET 界面
3.2 搜索方法
    private void Search()
    
{
        
//int startAt, len;
        string searchStr = this.Q;
        
string prefix = this.T;
        SearchTest searcher 
= new SearchTest();
        DateTime start 
= DateTime.Now;

        
// create the result DataTable
        this.Results.Columns.Add("title"typeof(string));
        
this.Results.Columns.Add("content"typeof(string));
        
this.Results.Columns.Add("url"typeof(string));

        
if ((searchStr.IndexOf(" "== -1)&&searchStr.Length>3)
        
{
            List
<string> resultList = Sj110.Com.Chinese.Tokenizer.Tokenize(searchStr);
            StringBuilder sb 
= new StringBuilder();
            
foreach (string result in resultList)
            
{
                
bool bStop=false;
                
foreach (string stop in m_stopWords)
                    
if (result == stop)
                    

                        bStop 
= true;
                        
break;
                    }


                
if (bStop == false)
                
{
                    sb.Append(result);
                    sb.Append(
" ");
                }


                
//sb.AppendFormat("{0} ", result);
            }

            sb.Remove(sb.Length 
- 11);
            searchStr 
= sb.ToString();
        }


        
try
        
{
            
string[] fields = "content""title" };
            
//Hits h = searcher.search(searchStr, fields, prefix);

            
//Hits h = searcher.search(searchStr, "content");
            Hits h = searcher.search(searchStr, prefix);
            
//this.m_total = h.Length();
            this.m_total = GetValidLength(h);
            
// initialize startAt
            this.m_startAt = initStartAt();

            
// how many items we should show - less than defined at the end of the results
            int resultsCount = smallerOf(m_total, this.m_maxResults + this.m_startAt);
            
// create highlighter

            
if (h.Length() == 0)
            
{
                DataRow row 
= this.Results.NewRow();
                row[
"title"= "您查询的关键字<font color=CC0033>" + searchStr + "</font>暂无结果。<br><br>提示:多个关键字之间请加空格。“<font color=black>公交 线路</font>”比“<font color=black>公交线路</font>”更容易搜到结果。";
                row[
"url"= "default.aspx";
                
this.Results.Rows.Add(row);
                
return;
            }

            
for (int i = m_startAt; i < resultsCount; i++)
            
{
                Document doc 
= h.Doc(i);

                
string url = doc.Get("url");
                
//if (url == m_oldUrl||url.EndsWith("/"))
                if (m_oldUrls.CheckRepeatUrl(url) || url.EndsWith("/"))
                
{
                    m_invalidCount
++;
                    resultsCount
++;
                    
continue;
                }

                
//m_oldUrl = url;
                string content = doc.Get("content");                
                
string title = doc.Get("title");
                
if (title.Trim() == "") title = "无标题";

                String[] searchArr 
= searchStr.Split(' ');

                
//startAt = content.IndexOf(searchArr[0]);
                
//startAt = startAt - 20;
                
//startAt = (startAt < 0 ? 0 : startAt);
                
//len = (startAt + 255 > content.Length ? content.Length - startAt : 255);
                
//content = content.Substring(startAt, len);
                content = GetBestFragments(content, searchArr);
                content 
= Hilighter(content, searchArr);
                title 
= Hilighter(title, searchArr);
                DataRow row 
= this.Results.NewRow();

                row[
"title"= title;
                row[
"content"= content;
                row[
"url"= url;

                
this.Results.Rows.Add(row);
                
            }

            
// result information
            this.m_duration = DateTime.Now - start;
            
this.m_fromItem = this.m_startAt + 1;
            
this.m_toItem = smallerOf(this.m_startAt + m_maxResults, m_total);

        }

        
catch (Exception ex)
        
{
            Console.WriteLine(ex.Message);
            
//throw;
            return;
        }

    }

   
posted @ 2007-08-06 16:05  Jadepark  阅读(1036)  评论(1)  编辑  收藏  举报