文件查找

项目功能描述

读取windows系统上的所有文件，根据输入的关键字找到包含关键字的文件

目的

了解文件的编码
多线程读取文件
多线程分段读取单个文件

实现思路

1、遍历操作系统上的所有盘符,找到所有文件,把找到的文件路径存到集合中

// Paths of every file discovered so far. Filled by the scan task below and
// drained by the worker tasks that call _files.TryDequeue.
private ConcurrentQueue<string> _files = new ConcurrentQueue<string>();

 // Background scan: walk every fixed drive and enqueue each file path found.
 Task task = new Task(() =>
            {
                try
                {
                    foreach (string drive in Environment.GetLogicalDrives())
                    {
                        var dir = new DriveInfo(drive);
                        // Only fixed disks are scanned; a drive that is not ready would throw on enumeration.
                        if (dir.DriveType != DriveType.Fixed || !dir.IsReady)
                        {
                            continue;
                        }
                        string name = dir.Name;
                        SetLog($"根路径{name}");
                        try
                        {
                            foreach (string file in Directory.GetFiles(name))
                            {
                                _files.Enqueue(file);
                            }
                            SearchDic(Directory.GetDirectories(name));
                        }
                        catch (Exception ex) when (ex is IOException || ex is UnauthorizedAccessException)
                        {
                            // One unreadable drive root must not abort the scan of the remaining drives.
                            SetLog($"无法访问{name}:{ex.Message}");
                        }
                    }
                }
                finally
                {
                    // The consumers stop only when the queue is empty AND this flag is true,
                    // so it must be raised (not cleared) once scanning is over - even on failure,
                    // otherwise the consumers would poll forever.
                    _isSearchDicEnd = true;
                    SetLog("查找结束");
                }
            });
            task.Start();

  /// <summary>
  /// Recursively walks the given directories and enqueues every file path found into <c>_files</c>.
  /// A directory that cannot be read is logged and skipped so the rest of the scan continues.
  /// </summary>
  /// <param name="dics">Directory paths to scan; <c>null</c> is treated as empty.</param>
  private void SearchDic(string[] dics)
        {
            if (dics == null)
            {
                return;
            }
            foreach (var item in dics)
            {
                try
                {
                    // Inside the try: building the DirectoryInfo can itself throw for an unusual path,
                    // and that must not abort the scan of the sibling directories.
                    SetLog($"文件夹:{ new DirectoryInfo(item).FullName }");
                    foreach (string file in Directory.GetFiles(item))
                    {
                        _files.Enqueue(file);
                    }
                    SearchDic(Directory.GetDirectories(item));
                }
                catch (Exception ex)
                {
                    // Best-effort scan: access-denied / vanished / over-long paths are expected on a
                    // whole-disk walk. Keep going, but record what was skipped instead of hiding it.
                    SetLog($"跳过文件夹:{item}({ex.Message})");
                }
            }
        }

2、创建指定数量的线程消费集合中的文件路径

  // Number of consumer tasks started by HandleFile.
  private int _maxFileTaskCount = 10;

    /// <summary>
    /// Starts <c>_maxFileTaskCount</c> consumer tasks. Each one repeatedly takes a path from
    /// <c>_files</c> and passes it to <c>JudgeFile</c>, and exits once the queue is empty and
    /// <c>_isSearchDicEnd</c> is set.
    /// </summary>
    private void HandleFile()
        {
            for (var i = 0; i < _maxFileTaskCount; i++)
            {
                Task task = new Task(() =>
                {
                    while (true)
                    {
                        // Sample the flag BEFORE trying to dequeue. If it was already set and the
                        // queue is then found empty, nothing more can arrive. Checking it after a
                        // failed dequeue could miss items enqueued in between.
                        bool searchEnded = _isSearchDicEnd;
                        string file;
                        if (!_files.TryDequeue(out file))
                        {
                            if (searchEnded)
                            {
                                return;
                            }
                            Thread.Sleep(100);
                            continue;
                        }
                        try
                        {
                            JudgeFile(file);
                        }
                        catch (Exception ex)
                        {
                            // A file that cannot be opened or read (locked, access denied, deleted
                            // meanwhile) must not terminate this worker; log it and move on.
                            SetLog($"处理文件失败:{file}({ex.Message})");
                        }
                    }
                });
                task.Start();
            }
        }

3、消费文件路径

1、判断文件的编码方式(c#总共考虑的文件编码方式有:Encoding.Default、Encoding.BigEndianUnicode、Encoding.Unicode、Encoding.UTF8)

如果文件有BOM，则直接根据BOM判断文件的编码方式

 /// <summary>
 /// Detects the text encoding of a file from (at most) its first 1 MiB, read from the stream's
 /// current position. A byte-order mark decides directly; otherwise the sample is handed to
 /// <c>GetWithoutBomFile</c> for heuristic detection.
 /// </summary>
 /// <param name="fs">Open, readable stream. It is left open and its position is advanced.</param>
 /// <returns>UTF-8, UTF-16 LE/BE, or whatever <c>GetWithoutBomFile</c> decides.</returns>
 private static Encoding GetFileEncoding(FileStream fs)
        {
            const int maxSampleLength = 1024 * 1024;
            // Clamp while still a long: casting fs.Length to int first overflows for files > 2 GiB.
            int count = (int)Math.Min(fs.Length, (long)maxSampleLength);
            // Deliberately not disposed: disposing the reader would also close the caller's stream.
            BinaryReader r = new BinaryReader(fs, Encoding.Default);
            byte[] data = r.ReadBytes(count);
            int dataLength = data.Length;

            // UTF-8 BOM: EF BB BF.
            if (dataLength >= 3 && data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF)
            {
                return Encoding.UTF8;
            }
            // UTF-16 BOMs are exactly two bytes; the byte that follows is file content and
            // must not take part in the check.
            if (dataLength >= 2)
            {
                if (data[0] == 0xFE && data[1] == 0xFF)
                {
                    return Encoding.BigEndianUnicode;
                }
                if (data[0] == 0xFF && data[1] == 0xFE)
                {
                    // NOTE(review): a UTF-32 LE BOM (FF FE 00 00) also lands here; UTF-32 is outside
                    // the set of encodings this tool considers.
                    return Encoding.Unicode;
                }
            }
            return GetWithoutBomFile(data);
        }

文件没有BOM时，根据文件内容推断编码方式（无法保证100%判断正确，有一定的错误率）

1、无bom判断Encoding.UTF8

根据此规则可以来判断是否是utf8编码:

UNICODE    UTF-8
00000000 - 0000007F	0xxxxxxx
00000080 - 000007FF	110xxxxx 10xxxxxx
00000800 - 0000FFFF	1110xxxx 10xxxxxx 10xxxxxx     有考虑
00010000 - 001FFFFF	11110xxx 10xxxxxx 10xxxxxx 10xxxxxx   有考虑
00200000 - 03FFFFFF	111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
04000000 - 7FFFFFFF	1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx

以下代码中只考虑了上面规则中的第3、4种

   /// <summary>
   /// Heuristically decides whether a BOM-less byte sample is UTF-8. Only 3-byte
   /// (1110xxxx 10xxxxxx 10xxxxxx) and 4-byte (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx) sequences are
   /// examined; ASCII and 2-byte sequences are skipped and count neither for nor against.
   /// </summary>
   /// <param name="data">Byte sample; <c>null</c> or empty yields <c>false</c>.</param>
   /// <returns>
   /// <c>true</c> when at least one complete 3- or 4-byte sequence is present and every such
   /// sequence in the sample is well-formed; otherwise <c>false</c>.
   /// </returns>
   private static bool IsUtf8Bytes(byte[] data)
        {
            if (data == null)
            {
                return false;
            }
            int dataLength = data.Length;
            bool foundSequence = false;
            int i = 0;
            while (i < dataLength)
            {
                int lead = data[i];
                if (lead < 0xE0)
                {
                    // ASCII, stray continuation byte or 2-byte lead: not considered by this heuristic.
                    i++;
                    continue;
                }
                if (lead > 0xF4)
                {
                    // F5..FF never occur in UTF-8.
                    return false;
                }
                // E0..EF start a 3-byte sequence (2 continuation bytes), F0..F4 a 4-byte one (3).
                int continuationCount = lead < 0xF0 ? 2 : 3;
                int last = i + continuationCount;
                if (last >= dataLength)
                {
                    // The sample ends inside this sequence (it may have been cut off by the caller).
                    // Whatever is present must still look like continuation bytes.
                    for (int j = i + 1; j < dataLength; j++)
                    {
                        if ((data[j] & 0xC0) != 0x80)
                        {
                            return false;
                        }
                    }
                    break;
                }
                for (int j = i + 1; j <= last; j++)
                {
                    // Continuation bytes are 10xxxxxx, i.e. 0x80..0xBF inclusive.
                    if ((data[j] & 0xC0) != 0x80)
                    {
                        return false;
                    }
                }
                int next = last + 1;
                if (next < dataLength && (data[next] & 0xC0) == 0x80)
                {
                    // One continuation byte too many: this was not a real UTF-8 sequence.
                    return false;
                }
                foundSequence = true;
                i = next;
            }
            return foundSequence;
        }

2、无bom判断是Encoding.BigEndianUnicode还是Encoding.Unicode

每次从文件字节数组中读取两个字节：如果第二个字节为0，则判定为Encoding.Unicode（小端）；否则如果第一个字节为0，则判定为Encoding.BigEndianUnicode（大端）

 /// <summary>
 /// Guesses the encoding of a BOM-less byte sample: UTF-8 if <c>IsUtf8Bytes</c> accepts it,
 /// otherwise UTF-16 LE/BE based on the first byte pair that contains a zero byte,
 /// otherwise <c>Encoding.Default</c>.
 /// </summary>
 /// <param name="data">Byte sample taken from the start of the file.</param>
 /// <returns>The guessed encoding.</returns>
 private static Encoding GetWithoutBomFile(byte[] data)
        {
            if (IsUtf8Bytes(data))
            {
                return Encoding.UTF8;
            }

            // Walk the sample in aligned 2-byte pairs; a trailing odd byte is ignored.
            int length = data.Length;
            for (int pairStart = 0; pairStart + 1 < length; pairStart += 2)
            {
                byte first = data[pairStart];
                byte second = data[pairStart + 1];
                // The second byte is tested first, so a 00 00 pair counts as little-endian.
                if (second == 0x00)
                {
                    return Encoding.Unicode;
                }
                if (first == 0x00)
                {
                    return Encoding.BigEndianUnicode;
                }
            }
            return Encoding.Default;
        }

2、多线程分段读取文件

文件为utf8编码时的读取方式：多个线程分段读取文件，每次读取时加锁，并给读到的每段字节数组编号；如果发现某一段末尾的字符被拆分（多字节字符不完整），就把末尾多余的字节存起来，与下一段字节数组拼接后一起解码

 /// <summary>
 /// Reads the UTF-8 file chunk by chunk and reports whether any decoded chunk contains the
 /// search keyword. A multi-byte character cut off at the end of a chunk is stored in
 /// <paramref name="byteDic"/> under the chunk's number and prepended to the following chunk
 /// by whichever caller reads that one.
 /// </summary>
 /// <param name="fs">Stream to read from; only read while <paramref name="lockObj"/> is held.</param>
 /// <param name="encoding">Encoding used to decode each chunk.</param>
 /// <param name="lockObj">Lock guarding the stream and <paramref name="fileBlockIndex"/>.</param>
 /// <param name="fileBlockIndex">Running chunk counter, incremented once per read.</param>
 /// <param name="byteDic">Carry-over bytes keyed by the number of the chunk they were cut from.</param>
 /// <param name="cts">Checked before every read; a cancelled source ends the scan with <c>false</c>.</param>
 /// <returns><c>true</c> as soon as a chunk contains the keyword; <c>false</c> at end of file or on cancellation.</returns>
 private bool JudgeFileBlockUtf8(FileStream fs, Encoding encoding, object lockObj, ref int fileBlockIndex, ConcurrentDictionary<int, byte[]> byteDic, CancellationTokenSource cts)
        {
            // NOTE(review): a keyword that straddles two chunks is not detected; chunks are matched independently.
            string keyword = tbSearchContent.Text;
            // One buffer per call is enough: a chunk is fully decoded before the next read overwrites it.
            byte[] bytes = new byte[_bytesLength];

            // A loop instead of one recursive call per chunk keeps the stack depth constant for large files.
            while (!cts.IsCancellationRequested)
            {
                int previousBlockIndex;
                int usableCount;
                lock (lockObj)
                {
                    // Captured under the lock so that "previous chunk" really is the chunk read just before this one.
                    previousBlockIndex = fileBlockIndex;
                    fileBlockIndex++;
                    int count = fs.Read(bytes, 0, _bytesLength);
                    if (count <= 0)
                    {
                        return false;
                    }
                    usableCount = FindIncompleteUtf8TailStart(bytes, count);
                    if (usableCount < count)
                    {
                        // Hand the cut-off bytes to whoever reads the next chunk.
                        byte[] tail = new byte[count - usableCount];
                        Array.Copy(bytes, usableCount, tail, 0, tail.Length);
                        byteDic.TryAdd(fileBlockIndex, tail);
                    }
                }

                string content;
                byte[] carried;
                // TryRemove: each carry-over is consumed exactly once, so the dictionary does not grow with the file.
                if (byteDic.TryRemove(previousBlockIndex, out carried))
                {
                    byte[] joined = new byte[carried.Length + usableCount];
                    Array.Copy(carried, 0, joined, 0, carried.Length);
                    Array.Copy(bytes, 0, joined, carried.Length, usableCount);
                    content = encoding.GetString(joined);
                }
                else
                {
                    content = encoding.GetString(bytes, 0, usableCount);
                }
                if (content.Contains(keyword))
                {
                    return true;
                }
            }
            return false;
        }

        /// <summary>
        /// Returns the index at which an incomplete trailing UTF-8 sequence starts within the first
        /// <paramref name="count"/> bytes of <paramref name="buffer"/>, or <paramref name="count"/>
        /// when the data ends on a character boundary.
        /// </summary>
        private static int FindIncompleteUtf8TailStart(byte[] buffer, int count)
        {
            // A sequence is at most 4 bytes long, so an incomplete one starts within the last 3 bytes.
            for (int back = 1; back <= 3 && back <= count; back++)
            {
                int position = count - back;
                int value = buffer[position];
                if ((value & 0xC0) == 0x80)
                {
                    // Continuation byte: keep walking back towards its lead byte.
                    continue;
                }
                int sequenceLength = value >= 0xF0 ? 4 : value >= 0xE0 ? 3 : value >= 0xC0 ? 2 : 1;
                return sequenceLength > back ? position : count;
            }
            return count;
        }

非utf8编码时的读取方式：多个线程分段读取文件，每次读取偶数个字节后再解码

     /// <summary>
     /// Reads the file chunk by chunk (<c>_bytesLength</c> bytes per read), decodes each chunk with
     /// <paramref name="encoding"/> and reports whether any chunk contains the search keyword.
     /// </summary>
     /// <param name="fs">Stream to read from, starting at its current position.</param>
     /// <param name="encoding">Encoding used to decode each chunk.</param>
     /// <param name="cts">Checked before every read; a cancelled source ends the scan with <c>false</c>.</param>
     /// <returns><c>true</c> as soon as a chunk contains the keyword; <c>false</c> at end of file or on cancellation.</returns>
     private bool JudgeFileBlock(FileStream fs, Encoding encoding, CancellationTokenSource cts)
        {
            // NOTE(review): a keyword that straddles two chunks is not detected; chunks are matched independently.
            string keyword = tbSearchContent.Text;
            // One buffer per call is enough: a chunk is fully decoded before the next read overwrites it.
            byte[] bytes = new byte[_bytesLength];

            // A loop instead of one recursive call per chunk keeps the stack depth constant for large files.
            while (!cts.IsCancellationRequested)
            {
                int count;
                // Several workers are handed the same stream, and FileStream reads are not safe to
                // interleave. The stream is the only object all of those callers share.
                lock (fs)
                {
                    count = fs.Read(bytes, 0, _bytesLength);
                }
                if (count <= 0)
                {
                    return false;
                }
                if (encoding.GetString(bytes, 0, count).Contains(keyword))
                {
                    return true;
                }
            }
            return false;
        }

3、把每段文件字节数组用判断到的编码方式编码，之后查看是否有包含指定关键字

   /// <summary>
   /// Reports <paramref name="file"/> through <c>SetContent</c> when its path contains the search
   /// keyword, or - for files whose extension appears in the extension box - when its content does.
   /// The content is scanned by <c>_maxFileBlockTaskCount</c> workers that share one stream.
   /// </summary>
   /// <param name="file">Full path of the file to examine.</param>
   private void JudgeFile(string file)
        {
            SetLog($"文件:{file}");
            if (file.Contains(tbSearchContent.Text))
            {
                SetContent(file);
                return;
            }

            string extension = Path.GetExtension(file);
            if (string.IsNullOrEmpty(extension) || !tbExtension.Text.Contains(extension))
            {
                return;
            }

            using (FileStream fs = new FileStream(file, FileMode.Open, FileAccess.Read))
            using (CancellationTokenSource cts = new CancellationTokenSource())
            {
                Encoding encoding = GetFileEncoding(fs);
                fs.Seek(0, SeekOrigin.Begin);
                bool isUtf8 = encoding == Encoding.UTF8;

                // Shared by ALL workers of this file. With per-worker copies the lock would protect
                // nothing, carry-over bytes would never reach the next chunk's reader, and cancelling
                // would not stop the other workers.
                ConcurrentDictionary<int, byte[]> byteDic = new ConcurrentDictionary<int, byte[]>();
                object lockObj = new object();
                int fileBlockIndex = 0;

                Task<bool>[] workers = new Task<bool>[_maxFileBlockTaskCount];
                for (int i = 0; i < workers.Length; i++)
                {
                    workers[i] = Task.Factory.StartNew(() =>
                    {
                        bool found = isUtf8
                            ? JudgeFileBlockUtf8(fs, encoding, lockObj, ref fileBlockIndex, byteDic, cts)
                            : JudgeFileBlock(fs, encoding, cts);
                        if (found)
                        {
                            // One hit is enough; let the other workers stop at their next check.
                            cts.Cancel();
                        }
                        return found;
                    });
                }

                // Waiting on the tasks themselves means completion is always signalled, even when a
                // worker fails, and the stream is not disposed while a worker is still reading.
                try
                {
                    Task.WaitAll(workers);
                }
                catch (AggregateException ex)
                {
                    SetLog($"读取文件失败:{file}({ex.GetBaseException().Message})");
                }

                bool matched = false;
                foreach (Task<bool> worker in workers)
                {
                    if (worker.Status == TaskStatus.RanToCompletion && worker.Result)
                    {
                        matched = true;
                        break;
                    }
                }
                if (matched)
                {
                    // Reported once per file, no matter how many workers found the keyword.
                    SetContent(file);
                }
            }
        }

代码地址

https://gitee.com/ffxxxdd/file-search.git

参考链接

https://blog.csdn.net/weixin_33924312/article/details/93937571
https://github.com/cyq1162/cyqdata/blob/master/Tool/IOHelper.cs

posted @ 2021-02-01 21:42 东东东阅读(142) 评论(0) 收藏举报

刷新页面返回顶部

东东东

文件查找

项目功能描述

目的

实现思路

1、遍历操作系统上的所有盘符,找到所有文件,把找到的文件路径存到集合中

2、创建指定数量的线程消费集合中的文件路径

3、消费文件路径

1、判断文件的编码方式(c#总共考虑的文件编码方式有:Encoding.Default、Encoding.BigEndianUnicode、Encoding.Unicode、Encoding.UTF8)

1、无bom判断Encoding.UTF8

2、无bom判断是Encoding.BigEndianUnicode还是Encoding.Unicode

2、多线程分段读取文件

3、把每段文件字节数组用判断到的编码方式编码，之后查看是否有包含指定关键字

代码地址

参考链接

公告