C# 文件筛选

 

调用方法:

 // Searcher instance and the list that will receive the matching file paths.
 FSearch _FSearch = new FSearch();
 List<string> o_Rtn = new List<string>();

 // Raw user input: folder to scan and keyword to look for.
 string _foldPath = textBox1.Text;
 string _searckkey = textBox2.Text;

 string foldPath = _foldPath;
 string searckkey = _searckkey;

 // Both inputs are mandatory; tell the user which one is missing and stop.
 bool hasFolder = !string.IsNullOrEmpty(foldPath);
 if (!hasFolder)
 {
     MessageBox.Show("请选择文件路径!");
     return;
 }

 bool hasKeyword = !string.IsNullOrEmpty(searckkey);
 if (!hasKeyword)
 {
     MessageBox.Show("请输入搜索关键字!");
     return;
 }

 // Run the search; SelectFile assigns the result list through the ref argument.
 _FSearch.SelectFile(foldPath, searckkey, ref o_Rtn);

 // Summary text: number of documents that matched.
 int matchCount = o_Rtn.Count;
 string Msg = string.Format("查询到符合条件的文档份数:{0}", matchCount);

  

 

 

 

代码

    /// <summary>
    /// 文件内容检索
    /// </summary>
    public class FSearch
    {
        /// <summary>
        /// Folder path of the most recent search; assigned by SelectFile from its i_foldPath argument.
        /// </summary>
        string _foldPath = string.Empty;

        /// <summary>
        /// Keyword of the most recent search; assigned by SelectFile from its i_searckkey argument.
        /// </summary>
        string _searckkey = string.Empty;

        // Progress text: listDirectory writes the path of the file it is currently examining here,
        // and SelectFile resets it to "" once a search finishes. Nothing in this class reads it.
        string _lbmessage = string.Empty;


        // Handed to listDirectory as its "coding" argument. It is never assigned in the visible
        // code, so it stays empty. NOTE(review): looks like a leftover of a UI combo box - confirm.
        string _comboBox2 = string.Empty;

        // Accumulates the paths of the files that matched during a search; cleared by SelectFile
        // before each run and filled by listDirectory.
        List<string> _dataGridView1 = new List<string>();

        /// <summary>
        /// Default file-type filter: the upper-case extensions, concatenated, that SelectFile
        /// passes to listDirectory.
        /// </summary>
        string extension = ".DOC.DOCX.XLS.XLSX.PPT.PPTX.PDF.HTML.HTM.TXT";


        /// <summary>
        /// Logger shared by all instances of this class.
        /// </summary>
        private static readonly ILog log = LogManager.GetLogger(typeof(FSearch));


        /// <summary>
        /// Searches a folder tree for files whose name or content contains a keyword.
        /// </summary>
        /// <param name="i_foldPath">Root folder to search. Must not be null or empty.</param>
        /// <param name="i_searckkey">Keyword to look for. Must not be null or empty.</param>
        /// <param name="o_Rtn">
        /// Receives a new list with the paths of all matching files when the search completes.
        /// It is left untouched when an argument is missing or when the search fails
        /// (both cases are only logged, never thrown to the caller).
        /// </param>
        public void SelectFile(string i_foldPath, string i_searckkey, ref List<string> o_Rtn)
        {
            try
            {
                _foldPath = i_foldPath;
                _searckkey = i_searckkey;

                if (string.IsNullOrEmpty(i_foldPath))
                {
                    log.Error("请选择文件路径!");
                    return;
                }

                if (string.IsNullOrEmpty(i_searckkey))
                {
                    log.Error("请输入搜索关键字!");
                    return;
                }

                _dataGridView1.Clear();
                listDirectory(i_foldPath, extension, _comboBox2, i_searckkey);

                // Hand out a copy: returning the internal list itself would let the next
                // search's Clear() silently empty a result the caller is still holding.
                o_Rtn = new List<string>(_dataGridView1);
                _lbmessage = "";

                // Normal completion is not an error, so it is logged at Info level.
                log.Info("搜索完毕");
            }
            catch (Exception err)
            {
                // Pass the exception object as well so the stack trace reaches the log.
                log.Error(err.Message, err);
            }
        }

        /// <summary>
        /// Examines every file directly inside <paramref name="path"/> and records the ones whose
        /// name or content contains <paramref name="searckkey"/>, then repeats the same work
        /// recursively for every non-hidden subfolder.
        /// </summary>
        /// <param name="path">Folder to search.</param>
        /// <param name="extension">
        /// File-type filter such as ".DOC.DOCX.TXT". Null or empty means "every file".
        /// Extensions are compared as whole tokens, ignoring case.
        /// </param>
        /// <param name="coding">Not used by this method; only forwarded to the recursive calls.</param>
        /// <param name="searckkey">Keyword to look for (case-sensitive substring match).</param>
        /// <remarks>
        /// A folder that cannot be listed because of missing permissions is logged and skipped
        /// instead of aborting the whole search. A file that cannot be parsed is logged and skipped.
        /// </remarks>
        public void listDirectory(string path, string extension, string coding, string searckkey)
        {
            DirectoryInfo theFolder = new DirectoryInfo(path);

            FileInfo[] files;
            DirectoryInfo[] subFolders;
            try
            {
                // Actually listing the folder is the only reliable accessibility test;
                // ACL flags such as AreAccessRulesProtected describe inheritance, not access.
                files = theFolder.GetFiles();
                subFolders = theFolder.GetDirectories();
            }
            catch (UnauthorizedAccessException ex)
            {
                log.ErrorFormat("【{0}】目录无法访问:{1}", path, ex.Message);
                return;
            }

            foreach (FileInfo file in files)
            {
                if (!IsExtensionAllowed(extension, file.Extension))
                {
                    continue;
                }

                string fullPath = Path.Combine(path, file.Name);
                _lbmessage = "正在搜索文件:" + fullPath;
                Application.DoEvents();

                // The file name is checked first; a hit there makes opening the file unnecessary.
                if (file.Name.Contains(searckkey))
                {
                    _dataGridView1.Add(fullPath);
                    continue;
                }

                try
                {
                    if (FileContentContains(fullPath, file.Extension, searckkey))
                    {
                        _dataGridView1.Add(fullPath);
                    }
                }
                catch (Exception ex)
                {
                    // Best effort: one unreadable or corrupt document must not stop the search.
                    log.ErrorFormat("【{0}】文档解析异常:{1}", fullPath, ex.Message);
                }
            }

            // Recurse into subfolders, skipping hidden ones.
            foreach (DirectoryInfo NextFolder in subFolders)
            {
                if ((NextFolder.Attributes & FileAttributes.Hidden) != FileAttributes.Hidden)
                {
                    listDirectory(NextFolder.FullName, extension, coding, searckkey);
                }
            }
        }

        /// <summary>
        /// Tells whether a file extension is listed in the filter. The filter is split into
        /// tokens so that only whole extensions match (".DO" or "" no longer slip through).
        /// </summary>
        private static bool IsExtensionAllowed(string extensionFilter, string fileExtension)
        {
            if (string.IsNullOrEmpty(extensionFilter))
            {
                return true;
            }

            if (string.IsNullOrEmpty(fileExtension))
            {
                return false;
            }

            string wanted = fileExtension.TrimStart('.');
            char[] separators = new char[] { '.', ';', ',', '|', ' ', '*' };
            foreach (string token in extensionFilter.Split(separators, StringSplitOptions.RemoveEmptyEntries))
            {
                if (string.Equals(token, wanted, StringComparison.OrdinalIgnoreCase))
                {
                    return true;
                }
            }

            return false;
        }

        /// <summary>
        /// Dispatches to the reader that matches the file type and reports whether the
        /// document content contains the keyword.
        /// </summary>
        private bool FileContentContains(string fullPath, string fileExtension, string searckkey)
        {
            string ext = fileExtension.ToUpperInvariant();

            if (ext.StartsWith(".XLS", StringComparison.Ordinal))
            {
                return ExcelContains(fullPath, ext, searckkey);
            }

            if (ext.StartsWith(".PPT", StringComparison.Ordinal))
            {
                // TODO: PowerPoint content search is not implemented; such files are
                // matched by file name only.
                return false;
            }

            if (ext.StartsWith(".PDF", StringComparison.Ordinal))
            {
                return PdfContains(fullPath, searckkey);
            }

            if (ext.StartsWith(".DOC", StringComparison.Ordinal))
            {
                return WordContains(fullPath, searckkey);
            }

            return TextContains(fullPath, searckkey);
        }

        /// <summary>
        /// Scans every cell of every sheet of an .xls/.xlsx workbook for the keyword.
        /// Other ".xls*" variants are not opened and never match.
        /// </summary>
        private bool ExcelContains(string fullPath, string upperExtension, string searckkey)
        {
            bool isXls = upperExtension == ".XLS";
            bool isXlsx = upperExtension == ".XLSX";
            if (!isXls && !isXlsx)
            {
                return false;
            }

            IWorkbook workbook;
            using (FileStream fs = new FileStream(fullPath, FileMode.Open, FileAccess.Read))
            {
                // .xls is the Excel 97-2003 format (HSSF); .xlsx is the 2007+ format (XSSF).
                if (isXls)
                {
                    workbook = new HSSFWorkbook(fs);
                }
                else
                {
                    workbook = new XSSFWorkbook(fs);
                }
            }

            for (int index = 0; index < workbook.NumberOfSheets; index++)
            {
                ISheet sheet = workbook.GetSheetAt(index);
                for (int i = sheet.FirstRowNum; i <= sheet.LastRowNum; i++)
                {
                    IRow row = sheet.GetRow(i);
                    if (row == null)
                    {
                        continue;
                    }

                    for (int j = row.FirstCellNum; j < row.LastCellNum; j++)
                    {
                        ICell cell = row.GetCell(j);
                        if (cell != null && cell.ToString().Contains(searckkey))
                        {
                            return true;
                        }
                    }
                }
            }

            return false;
        }

        /// <summary>
        /// Extracts the text of each PDF page in turn and stops at the first page that
        /// contains the keyword, so a file is reported at most once.
        /// </summary>
        private bool PdfContains(string fullPath, string searckkey)
        {
            PdfDocument pdf = new PdfDocument();
            try
            {
                pdf.LoadFromFile(fullPath);
                foreach (PdfPageBase page in pdf.Pages)
                {
                    string content = page.ExtractText();
                    if (content != null && content.Contains(searckkey))
                    {
                        return true;
                    }
                }

                return false;
            }
            finally
            {
                pdf.Close();
            }
        }

        /// <summary>
        /// Opens the document through Word automation and checks its text for the keyword.
        /// The document and the Word instance are always shut down, even on failure,
        /// so no orphaned WINWORD process is left behind.
        /// </summary>
        private bool WordContains(string fullPath, string searckkey)
        {
            MSWord.Application app = new MSWord.Application();
            try
            {
                MSWord.Document doc = null;
                try
                {
                    object _file = fullPath;
                    doc = app.Documents.Open(ref _file);

                    // NOTE(review): in a verbatim string "\\a" matches a literal backslash
                    // followed by 'a', not a control character, so this replace is close to
                    // a no-op. Left unchanged on purpose; confirm the intended pattern.
                    string text2 = Regex.Replace(doc.Content.Text, @"(\\a|\\t|\\n|\\s+)", "");

                    return text2.Contains(searckkey);
                }
                finally
                {
                    if (doc != null)
                    {
                        doc.Close();
                    }
                }
            }
            finally
            {
                app.Quit();
            }
        }

        /// <summary>
        /// Reads any other file as text, line by line, using the encoding detected by
        /// GetType, and stops at the first line that contains the keyword.
        /// </summary>
        private bool TextContains(string fullPath, string searckkey)
        {
            Encoding codingType = GetType(fullPath);

            using (FileStream fs = new FileStream(fullPath, FileMode.Open, FileAccess.Read))
            using (StreamReader sr = new StreamReader(fs, codingType))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    if (line.Contains(searckkey))
                    {
                        return true;
                    }
                }
            }

            return false;
        }


        
        #region 共通方法


        // Author's note: encoding detection is an unsolved problem in general; even
        // Internet Explorer's auto-detection sometimes guesses wrong.
        // Opens the named file read-only, asks the FileStream overload of GetType for the
        // encoding, and closes the file again before returning.
        // Usage: common.GetType(filename)
        public  System.Text.Encoding GetType(string FILE_NAME)
        {
            FileStream stream = new FileStream(FILE_NAME, FileMode.Open, FileAccess.Read);
            try
            {
                System.Text.Encoding detected = GetType(stream);
                stream.Close();
                return detected;
            }
            finally
            {
                // Runs on success and on failure alike; disposing twice is harmless.
                stream.Dispose();
            }
        }
        /// <summary>
        /// Guesses the text encoding of a file from the bytes of its stream.
        /// </summary>
        /// <remarks>
        /// Detection order: UTF-16 byte order mark (FE FF = big endian, FF FE = little endian),
        /// UTF-8 byte order mark (EF BB BF), then a content scan with IsUTF8Bytes for UTF-8
        /// without a mark. Anything else yields the system default encoding.
        /// Files too large to be read into a single array are judged by their byte order mark only.
        /// </remarks>
        /// <param name="fs">
        /// File stream to inspect. It is read from its current position and is CLOSED before
        /// this method returns (closing the reader closes the stream).
        /// </param>
        /// <returns>The detected encoding, or <see cref="Encoding.Default"/> when nothing matched.</returns>
        public  System.Text.Encoding GetType(FileStream fs)
        {
            BinaryReader r = new BinaryReader(fs, System.Text.Encoding.Default);
            try
            {
                // The content scan needs the whole file in memory; when that is impossible
                // only the first three bytes are read for the byte order mark check.
                bool scanContent = fs.Length <= int.MaxValue;
                int length = scanContent ? (int)fs.Length : 3;
                byte[] ss = r.ReadBytes(length);

                // A UTF-16 byte order mark is two bytes long; the third byte is already text
                // and must not take part in the comparison.
                if (ss.Length >= 2 && ss[0] == 0xFE && ss[1] == 0xFF)
                {
                    return Encoding.BigEndianUnicode;
                }

                if (ss.Length >= 2 && ss[0] == 0xFF && ss[1] == 0xFE)
                {
                    return Encoding.Unicode;
                }

                bool hasUtf8Bom = ss.Length >= 3 && ss[0] == 0xEF && ss[1] == 0xBB && ss[2] == 0xBF;
                if (hasUtf8Bom || (scanContent && IsUTF8Bytes(ss)))
                {
                    return Encoding.UTF8;
                }

                return Encoding.Default;
            }
            finally
            {
                // Always release the reader (and with it the stream), also when detection throws.
                r.Close();
            }
        }

        /// <summary>
        /// Checks whether a byte sequence is structurally valid UTF-8 (no byte order mark required).
        /// </summary>
        /// <param name="data">Bytes to examine.</param>
        /// <returns>
        /// <c>true</c> when every byte is either plain ASCII or part of a well-formed multi-byte
        /// sequence of 2 to 6 bytes, which includes empty and pure-ASCII input;
        /// <c>false</c> otherwise, including data that ends in the middle of a sequence.
        /// </returns>
        private bool IsUTF8Bytes(byte[] data)
        {
            // Number of bytes still expected for the character being read, counting the
            // current one; 1 means "at a character boundary".
            int charByteCounter = 1;

            for (int i = 0; i < data.Length; i++)
            {
                byte curByte = data[i];

                if (charByteCounter == 1)
                {
                    if (curByte < 0x80)
                    {
                        continue; // single-byte (ASCII) character
                    }

                    // The count of leading 1 bits of a lead byte is the sequence length:
                    // 110xxxxx = 2 bytes ... 1111110x = 6 bytes.
                    int leadingOnes = 0;
                    for (int mask = 0x80; (curByte & mask) != 0; mask >>= 1)
                    {
                        leadingOnes++;
                    }

                    // Exactly one leading 1 is a stray continuation byte; more than six is
                    // not a lead byte at all.
                    if (leadingOnes == 1 || leadingOnes > 6)
                    {
                        return false;
                    }

                    charByteCounter = leadingOnes;
                }
                else
                {
                    // Continuation bytes must look like 10xxxxxx.
                    if ((curByte & 0xC0) != 0x80)
                    {
                        return false;
                    }

                    charByteCounter--;
                }
            }

            // Data that stops in the middle of a multi-byte character is simply not valid
            // UTF-8; report that instead of throwing so callers can fall back to another encoding.
            return charByteCounter == 1;
        }

        #endregion

    }

 

posted @ 2021-02-03 11:56  人生为卒  阅读(463)  评论(0编辑  收藏  举报