C#搜索抓取本地硬盘文件
问题描述:譬如说要搜索d:盘中的所有后缀为html的文件。这里只是简单实现一个抓取的类,大家可以根据需要自行扩展
关键是用到一个递归函数,依次搜索各级子文件夹下的文件,并将每个文件的内容以当前时间(加序号)命名,保存为f:\files\目录下的txt文件
/// <summary>
/// Recursively collects the files below a directory that match a search pattern and
/// stores a copy of each file's text as a time-stamped .txt file under f:\files\.
/// </summary>
public class SearchBase
{
    // Private constructor: the class only exposes static members and is not meant to be instantiated.
    private SearchBase()
    {
    }

    // Running counter appended to every output file name so that several files
    // written within the same second still get distinct names.
    static int ra;

    /// <summary>
    /// Copies every file in <paramref name="directory"/> that matches <paramref name="pattern"/>
    /// and then repeats the search in each sub-directory.
    /// </summary>
    /// <param name="directory">Directory at which the search starts.</param>
    /// <param name="pattern">Search pattern passed to <c>DirectoryInfo.GetFiles</c>, e.g. "*.html".</param>
    /// <exception cref="ArgumentNullException"><paramref name="directory"/> is null.</exception>
    /// <remarks>
    /// Directories whose content may not be listed are skipped. Errors raised while reading or
    /// writing an individual file are not handled here and propagate to the caller.
    /// </remarks>
    public static void addSubDirectory(DirectoryInfo directory, string pattern)
    {
        if (directory == null)
        {
            throw new ArgumentNullException("directory");
        }

        FileInfo[] matchingFiles;
        DirectoryInfo[] subDirectories;
        try
        {
            matchingFiles = directory.GetFiles(pattern);
            subDirectories = directory.GetDirectories();
        }
        catch (UnauthorizedAccessException)
        {
            // Protected folders (for example "System Volume Information" in a drive root)
            // cannot be listed. Skip that sub-tree instead of aborting the whole search.
            return;
        }

        foreach (FileInfo fi in matchingFiles)
        {
            addrelativeDocument(fi.FullName);
        }

        foreach (DirectoryInfo di in subDirectories)
        {
            addSubDirectory(di, pattern);
        }
    }

    /// <summary>
    /// Reads the file at <paramref name="path"/> and writes its text to a new file named
    /// "HHmmss(counter).txt" inside f:\files\.
    /// </summary>
    /// <param name="path">Full path of the file to copy.</param>
    public static void addrelativeDocument(string path)
    {
        string html;
        using (StreamReader sr = new StreamReader(path, System.Text.Encoding.Default))
        {
            html = sr.ReadToEnd();
        }

        const string outputDirectory = @"f:\files\";

        // Make sure the target folder exists; this is a no-op when it is already there.
        Directory.CreateDirectory(outputDirectory);

        // A fixed, culture-invariant format keeps the name free of separators and
        // AM/PM designators regardless of the regional settings of the machine.
        string newtime = DateTime.Now.ToString("HHmmss", System.Globalization.CultureInfo.InvariantCulture);

        // Advance the counter until an unused name is found, so that a document is never
        // silently dropped (and an existing file never overwritten) on a name collision.
        string p;
        do
        {
            p = outputDirectory + newtime + "(" + ra + ").txt";
            ra++;
        }
        while (File.Exists(p));

        // The using blocks release the file handle even if writing fails.
        using (FileStream fs = new FileStream(p, FileMode.Create, FileAccess.Write))
        using (StreamWriter sw = new StreamWriter(fs))
        {
            sw.Write(html);
        }
    }
}
{
// Private constructor: only static members are used, so no instance is ever created from outside.
private SearchBase()
{
}
// Running counter that addrelativeDocument appends to each output file name and then increments.
static int ra;
/// <summary>
/// Copies every file in <paramref name="directory"/> that matches <paramref name="pattern"/>
/// and then repeats the search in each sub-directory.
/// </summary>
/// <param name="directory">Directory at which the search starts.</param>
/// <param name="pattern">Search pattern passed to <c>DirectoryInfo.GetFiles</c>, e.g. "*.html".</param>
/// <exception cref="ArgumentNullException"><paramref name="directory"/> is null.</exception>
/// <remarks>
/// Directories whose content may not be listed are skipped. Errors raised while copying an
/// individual file are not handled here and propagate to the caller.
/// </remarks>
public static void addSubDirectory(DirectoryInfo directory,string pattern)
{
    if (directory == null)
    {
        throw new ArgumentNullException("directory");
    }

    FileInfo[] matchingFiles;
    DirectoryInfo[] subDirectories;
    try
    {
        matchingFiles = directory.GetFiles(pattern);
        subDirectories = directory.GetDirectories();
    }
    catch (UnauthorizedAccessException)
    {
        // Protected folders (for example "System Volume Information" in a drive root)
        // cannot be listed. Skip that sub-tree instead of aborting the whole search.
        return;
    }

    foreach (FileInfo fi in matchingFiles)
    {
        addrelativeDocument(fi.FullName);
    }

    foreach (DirectoryInfo di in subDirectories)
    {
        addSubDirectory(di,pattern);
    }
}
/// <summary>
/// Reads the file at <paramref name="path"/> and writes its text to a new file named
/// "HHmmss(counter).txt" inside f:\files\.
/// </summary>
/// <param name="path">Full path of the file to copy.</param>
public static void addrelativeDocument(string path)
{
    string html;
    using (StreamReader sr = new StreamReader(path, System.Text.Encoding.Default))
    {
        html = sr.ReadToEnd();
    }

    const string outputDirectory = @"f:\files\";

    // Make sure the target folder exists; this is a no-op when it is already there.
    Directory.CreateDirectory(outputDirectory);

    // A fixed, culture-invariant format keeps the name free of separators and
    // AM/PM designators regardless of the regional settings of the machine.
    string newtime = DateTime.Now.ToString("HHmmss", System.Globalization.CultureInfo.InvariantCulture);

    // Advance the counter until an unused name is found, so that a document is never
    // silently dropped (and an existing file never overwritten) on a name collision.
    string p;
    do
    {
        p = outputDirectory + newtime + "(" + ra + ").txt";
        ra++;
    }
    while (File.Exists(p));

    // The using blocks release the file handle even if writing fails.
    using (FileStream fs = new FileStream(p, FileMode.Create, FileAccess.Write))
    using (StreamWriter sw = new StreamWriter(fs))
    {
        sw.Write(html);
    }
}
}
前台只需像下面这样简单调用即可:
SearchBase.addSubDirectory(new DirectoryInfo(@"d:\"),"*.html");
另外,可以自己去加工搜索出来的文件,解析成xml文件,将其导入到数据库中,这个以后有空写篇随笔~
浙公网安备 33010602011771号