文件查找
项目功能描述
读取windows系统上的所有文件,根据输入的关键字找到包含关键字的文件
目的
- 了解文件的编码
- 多线程读取文件
- 多线程分段读取单个文件
实现思路
1、遍历操作系统上的所有盘符,找到所有文件,把找到的文件路径存到集合中
// Paths of the files discovered so far: filled by the scan task below, drained by the consumer tasks.
private ConcurrentQueue<string> _files = new ConcurrentQueue<string>();
// Producer: walks every fixed drive and enqueues each file path it can reach.
Task task = new Task(() =>
{
    try
    {
        foreach (string drive in Environment.GetLogicalDrives())
        {
            var dir = new DriveInfo(drive);
            // Only local fixed disks are scanned; removable, network and optical drives are skipped.
            if (dir.DriveType != DriveType.Fixed)
            {
                continue;
            }
            string name = dir.Name;
            SetLog($"根路径{name}");
            try
            {
                foreach (string file in Directory.GetFiles(name))
                {
                    _files.Enqueue(file);
                }
                SearchDic(Directory.GetDirectories(name));
            }
            catch (Exception ex) when (ex is IOException || ex is UnauthorizedAccessException)
            {
                // One unreadable drive root must not abort the scan of the remaining drives.
                SetLog($"跳过根路径:{name},{ex.Message}");
            }
        }
    }
    finally
    {
        // The consumers stop only once this flag is true and the queue is empty, so it has to be
        // raised even when the scan ends early; leaving it false keeps them polling forever.
        _isSearchDicEnd = true;
    }
    SetLog("查找结束");
});
task.Start();
/// <summary>
/// Recursively walks the given directories and enqueues every file path found into <c>_files</c>.
/// The walk is best-effort: a directory that cannot be listed is logged and skipped.
/// </summary>
/// <param name="dics">Full paths of the directories to scan.</param>
private void SearchDic(string[] dics)
{
    foreach (var item in dics)
    {
        SetLog($"文件夹:{ new DirectoryInfo(item).FullName }");
        try
        {
            foreach (string file in Directory.GetFiles(item))
            {
                _files.Enqueue(file);
            }
            SearchDic(Directory.GetDirectories(item));
        }
        catch (Exception ex)
        {
            // Listing can fail for many reasons (access denied, path too long, folder removed
            // meanwhile). Keep scanning the siblings, but leave a trace instead of hiding it.
            SetLog($"跳过文件夹:{item},{ex.Message}");
        }
    }
}
2、创建指定数量的线程消费集合中的文件路径
// Number of consumer workers started by HandleFile.
private int _maxFileTaskCount = 10;

/// <summary>
/// Starts <c>_maxFileTaskCount</c> background workers that drain <c>_files</c> and pass each
/// path to <c>JudgeFile</c>.
/// </summary>
private void HandleFile()
{
    for (var i = 0; i < _maxFileTaskCount; i++)
    {
        // The workers live for the whole scan and block while waiting, so ask for dedicated
        // threads instead of occupying thread-pool threads.
        Task.Factory.StartNew(ConsumeFiles, TaskCreationOptions.LongRunning);
    }
}

/// <summary>
/// Worker loop: dequeues file paths until the directory scan has ended and the queue is empty.
/// </summary>
private void ConsumeFiles()
{
    while (true)
    {
        // Read the end flag BEFORE trying to dequeue. If the scan had already ended and the
        // queue is then found empty, nothing more can arrive. Checking the flag after a failed
        // dequeue could exit while the last paths were still being enqueued.
        bool searchEnded = _isSearchDicEnd;
        string file;
        if (_files.TryDequeue(out file))
        {
            try
            {
                JudgeFile(file);
            }
            catch (Exception ex)
            {
                // One unreadable file must not terminate this worker.
                SetLog($"处理文件失败:{file},{ex.Message}");
            }
            continue;
        }
        if (searchEnded)
        {
            return;
        }
        Thread.Sleep(100);
    }
}
3、消费文件路径
1、判断文件的编码方式(本程序只考虑以下四种编码:Encoding.Default、Encoding.BigEndianUnicode、Encoding.Unicode、Encoding.UTF8)
- 判断文件有bom,则直接根据bom判断文件的编码方式
/// <summary>
/// Guesses the text encoding of a file from its first bytes (at most 1 MiB is sampled).
/// A byte order mark decides directly; otherwise the sample is handed to
/// <c>GetWithoutBomFile</c>.
/// </summary>
/// <param name="fs">Open, readable stream. It is read from its current position and is left
/// positioned after the sample; the caller must seek back before reading the content.</param>
/// <returns>The detected encoding.</returns>
private static Encoding GetFileEncoding(FileStream fs)
{
    const int maxSampleLength = 1024 * 1024;
    // Clamp as long first: casting fs.Length straight to int overflows for files over 2 GB.
    int count = (int)Math.Min(fs.Length, maxSampleLength);

    // The reader is intentionally not disposed: disposing it would close the caller's stream.
    BinaryReader r = new BinaryReader(fs, Encoding.Default);
    byte[] data = r.ReadBytes(count);
    int dataLength = data.Length;

    // The UTF-16 byte order marks are exactly two bytes long; what follows is already content.
    if (dataLength >= 2)
    {
        if (data[0] == 0xFE && data[1] == 0xFF)
        {
            return Encoding.BigEndianUnicode;
        }
        if (data[0] == 0xFF && data[1] == 0xFE)
        {
            return Encoding.Unicode;
        }
    }
    // The UTF-8 byte order mark is three bytes long; a file may consist of nothing else.
    if (dataLength >= 3 && data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF)
    {
        return Encoding.UTF8;
    }
    return GetWithoutBomFile(data);
}
- 文件没有bom判断文件的编码方式(不是100%成功判断,有一定的错误率)
1、无bom判断Encoding.UTF8
- 根据此规则可以来判断是否是utf8编码:
UNICODE UTF-8
00000000 - 0000007F 0xxxxxxx
00000080 - 000007FF 110xxxxx 10xxxxxx
00000800 - 0000FFFF 1110xxxx 10xxxxxx 10xxxxxx 有考虑
00010000 - 001FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 有考虑
00200000 - 03FFFFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
04000000 - 7FFFFFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
以下代码中只考虑了上面规则中的第3、4种
/// <summary>
/// Heuristically decides whether <paramref name="data"/> is UTF-8 text without a byte order mark.
/// Only 3-byte and 4-byte sequences are used as evidence: the first lead byte of such a sequence
/// decides the result. Pure ASCII input and input containing only 2-byte sequences yield false.
/// </summary>
/// <param name="data">Sample of the file content.</param>
/// <returns>
/// true when the first 3- or 4-byte lead byte is followed by exactly the expected number of
/// continuation bytes (0x80-0xBF) and the byte after the sequence, if any, is not another
/// continuation byte; otherwise false.
/// </returns>
private static bool IsUtf8Bytes(byte[] data)
{
    int dataLength = data.Length;
    for (int i = 0; i < dataLength; i++)
    {
        int item = data[i];
        int continuationCount;
        if (item >= 0xF8)
        {
            // 0xF8-0xFF never start a 3- or 4-byte sequence, so this is not the UTF-8 we accept.
            return false;
        }
        else if (item >= 0xF0)
        {
            continuationCount = 3; // 11110xxx: lead byte of a 4-byte sequence
        }
        else if (item >= 0xE0)
        {
            continuationCount = 2; // 1110xxxx: lead byte of a 3-byte sequence
        }
        else
        {
            // ASCII, 2-byte sequences and stray continuation bytes carry no evidence here.
            continue;
        }

        int last = i + continuationCount;
        if (last >= dataLength)
        {
            // The sequence is cut off by the end of the sample.
            return false;
        }
        for (int j = i + 1; j <= last; j++)
        {
            if (!IsUtf8ContinuationByte(data[j]))
            {
                return false;
            }
        }
        // One continuation byte too many means the lead byte was not really a UTF-8 lead.
        int next = last + 1;
        return next >= dataLength || !IsUtf8ContinuationByte(data[next]);
    }
    return false;
}

/// <summary>Returns true for a UTF-8 continuation byte (bit pattern 10xxxxxx, 0x80-0xBF).</summary>
private static bool IsUtf8ContinuationByte(byte value)
{
    return value >= 0x80 && value < 0xC0;
}
2、无bom判断是Encoding.BigEndianUnicode还是Encoding.Unicode
- 每次从文件字节数组中读取两个字节,如果第一个字节为0,则为Encoding.BigEndianUnicode,如果第二个字节为0,则为Encoding.Unicode
/// <summary>
/// Guesses the encoding of content that carries no byte order mark.
/// UTF-8 is tried first; otherwise the bytes are inspected in pairs and the first pair that
/// contains a zero byte decides between little-endian and big-endian UTF-16.
/// </summary>
/// <param name="data">Sample of the file content.</param>
/// <returns>
/// <c>Encoding.UTF8</c>, <c>Encoding.Unicode</c>, <c>Encoding.BigEndianUnicode</c>, or
/// <c>Encoding.Default</c> when nothing matched.
/// </returns>
private static Encoding GetWithoutBomFile(byte[] data)
{
    if (IsUtf8Bytes(data))
    {
        return Encoding.UTF8;
    }

    // pos always addresses the second byte of a pair; a trailing unpaired byte is ignored.
    for (int pos = 1; pos < data.Length; pos += 2)
    {
        if (data[pos] == 0x00)
        {
            // Zero in the second byte of the pair: little-endian UTF-16.
            return Encoding.Unicode;
        }
        if (data[pos - 1] == 0x00)
        {
            // Zero in the first byte of the pair: big-endian UTF-16.
            return Encoding.BigEndianUnicode;
        }
    }

    return Encoding.Default;
}
2、多线程分段读取文件
- 文件为 UTF-8 编码时的读取方式:多线程分段读取文件,每次读取时加锁,并给读到的每段字节数组编号;如果发现该段末尾的字符被拆分,就把末尾不完整的字节存起来,放到下一段字节数组的开头一起解码
/// <summary>
/// Reads a UTF-8 file block by block and reports whether any decoded block contains the search
/// keyword (<c>tbSearchContent.Text</c>). A multi-byte character cut off at the end of a block
/// is stored in <paramref name="byteDic"/> and prepended to the following block.
/// </summary>
/// <param name="fs">Stream to read; every read happens while holding <paramref name="lockObj"/>.</param>
/// <param name="encoding">Encoding used to decode each block.</param>
/// <param name="lockObj">Lock that guards the stream, the block counter and the carry-over store.</param>
/// <param name="fileBlockIndex">Running block counter, incremented once per read.</param>
/// <param name="byteDic">Carry-over bytes, keyed by the index of the block they were cut from.</param>
/// <param name="cts">Cancellation source; the scan stops once cancellation is requested.</param>
/// <returns>true if a block contains the keyword; false at end of file or on cancellation.</returns>
/// <remarks>A keyword that spans two blocks is not detected.</remarks>
private bool JudgeFileBlockUtf8(FileStream fs, Encoding encoding, object lockObj, ref int fileBlockIndex, ConcurrentDictionary<int, byte[]> byteDic, CancellationTokenSource cts)
{
    string keyword = tbSearchContent.Text;

    // Iterate instead of recursing once per block: a large file would otherwise need one
    // stack frame per block.
    while (!cts.IsCancellationRequested)
    {
        byte[] bytes = new byte[_bytesLength];
        int previousBlockIndex;
        int completeCount;
        lock (lockObj)
        {
            // Take the previous index inside the lock so that it always names the block read
            // immediately before this one.
            previousBlockIndex = fileBlockIndex;
            fileBlockIndex++;
            int count = fs.Read(bytes, 0, _bytesLength);
            if (count <= 0)
            {
                return false; // end of file
            }
            completeCount = FindCompleteUtf8Length(bytes, count);
            if (completeCount < count)
            {
                // Keep the cut-off character for whoever reads the next block.
                byte[] tail = new byte[count - completeCount];
                Array.Copy(bytes, completeCount, tail, 0, tail.Length);
                byteDic.TryAdd(fileBlockIndex, tail);
            }
        }

        string content;
        byte[] carried;
        if (byteDic.TryRemove(previousBlockIndex, out carried))
        {
            byte[] joined = new byte[carried.Length + completeCount];
            Array.Copy(carried, 0, joined, 0, carried.Length);
            Array.Copy(bytes, 0, joined, carried.Length, completeCount);
            content = encoding.GetString(joined);
        }
        else
        {
            content = encoding.GetString(bytes, 0, completeCount);
        }

        if (content.Contains(keyword))
        {
            return true;
        }
    }
    return false;
}

/// <summary>
/// Returns how many of the first <paramref name="count"/> bytes form complete UTF-8 sequences,
/// i.e. the offset at which a multi-byte character cut off by the end of the block begins
/// (or <paramref name="count"/> when nothing is cut off).
/// </summary>
private static int FindCompleteUtf8Length(byte[] buffer, int count)
{
    // A lead byte has at most three continuation bytes, so looking three bytes back is enough.
    for (int back = 1; back <= 3 && back <= count; back++)
    {
        int value = buffer[count - back];
        if (value >= 0x80 && value < 0xC0)
        {
            continue; // continuation byte: the lead byte is further back
        }
        int sequenceLength = value >= 0xF0 ? 4 : value >= 0xE0 ? 3 : value >= 0xC0 ? 2 : 1;
        return sequenceLength > back ? count - back : count;
    }
    return count;
}
- 非utf8编码读取方式-多线程每次分段读偶数个字节数然后编码
/// <summary>
/// Reads a non-UTF-8 file block by block and reports whether any decoded block contains the
/// search keyword (<c>tbSearchContent.Text</c>).
/// </summary>
/// <param name="fs">Stream to read; it may be shared with other workers scanning the same file.</param>
/// <param name="encoding">Encoding used to decode each block.</param>
/// <param name="cts">Cancellation source; the scan stops once cancellation is requested.</param>
/// <returns>true if a block contains the keyword; false at end of file or on cancellation.</returns>
/// <remarks>A keyword that spans two blocks is not detected.</remarks>
private bool JudgeFileBlock(FileStream fs, Encoding encoding, CancellationTokenSource cts)
{
    // Reused for every block of this call; only the bytes actually read are decoded.
    byte[] bytes = new byte[_bytesLength];

    // Iterate instead of recursing once per block: a large file would otherwise need one
    // stack frame per block.
    while (!cts.IsCancellationRequested)
    {
        int count;
        // FileStream does not support concurrent reads, and the caller runs several workers
        // on the same stream. The stream is created per file by the caller, so it serves as
        // the lock object here.
        lock (fs)
        {
            count = fs.Read(bytes, 0, _bytesLength);
        }
        if (count <= 0)
        {
            return false; // end of file
        }
        string content = encoding.GetString(bytes, 0, count);
        if (content.Contains(tbSearchContent.Text))
        {
            return true;
        }
    }
    return false;
}
3、把每段文件字节数组用判断到的编码方式编码,之后查看是否有包含指定关键字
/// <summary>
/// Checks one file against the search keyword (<c>tbSearchContent.Text</c>) and reports a hit
/// through <c>SetContent</c>. The path is checked first; the content is scanned only when the
/// file's extension occurs in <c>tbExtension.Text</c>.
/// </summary>
/// <param name="file">Full path of the file to check.</param>
private void JudgeFile(string file)
{
    SetLog($"文件:{file}");
    if (file.Contains(tbSearchContent.Text))
    {
        SetContent(file);
        return;
    }

    string extension = Path.GetExtension(file);
    if (string.IsNullOrEmpty(extension) || !tbExtension.Text.Contains(extension))
    {
        return;
    }

    bool found;
    try
    {
        found = FileContainsKeyword(file);
    }
    catch (Exception ex) when (ex is IOException || ex is UnauthorizedAccessException || ex is AggregateException)
    {
        // Locked or protected files are expected when scanning whole drives: log and move on.
        SetLog($"读取文件失败:{file},{ex.GetBaseException().Message}");
        return;
    }

    if (found)
    {
        SetContent(file);
    }
}

/// <summary>
/// Detects the file's encoding and scans its content with <c>_maxFileBlockTaskCount</c>
/// parallel block readers.
/// </summary>
/// <returns>true if any reader found the keyword.</returns>
private bool FileContainsKeyword(string file)
{
    using (FileStream fs = new FileStream(file, FileMode.Open, FileAccess.Read))
    using (CancellationTokenSource cts = new CancellationTokenSource())
    {
        Encoding encoding = GetFileEncoding(fs);
        fs.Seek(0, SeekOrigin.Begin);

        // One lock, one block counter, one carry-over store and one cancellation source per
        // FILE: the readers share the stream, so they must share this state as well.
        ConcurrentDictionary<int, byte[]> byteDic = new ConcurrentDictionary<int, byte[]>();
        object lockObj = new object();
        int fileBlockIndex = 0;

        Task<bool>[] readers = new Task<bool>[_maxFileBlockTaskCount];
        for (int i = 0; i < readers.Length; i++)
        {
            readers[i] = Task.Factory.StartNew(() =>
            {
                bool result = encoding == Encoding.UTF8
                    ? JudgeFileBlockUtf8(fs, encoding, lockObj, ref fileBlockIndex, byteDic, cts)
                    : JudgeFileBlock(fs, encoding, cts);
                if (result)
                {
                    cts.Cancel(); // tell the other readers to stop
                }
                return result;
            });
        }

        // Waits for every reader to finish before the stream is disposed; reader failures
        // surface here as an AggregateException.
        Task.WaitAll(readers);
        return readers.Any(reader => reader.Result);
    }
}
代码地址
https://gitee.com/ffxxxdd/file-search.git
参考链接
https://blog.csdn.net/weixin_33924312/article/details/93937571
https://github.com/cyq1162/cyqdata/blob/master/Tool/IOHelper.cs

浙公网安备 33010602011771号