I. Blog and GitHub Links

郑孔宇  俞凯欣

GitHub project link

II. Division of Work

Basic functionality: 郑孔宇
Crawler tool, testing, and bonus features: 俞凯欣

III. PSP Table

| PSP2.1 | Personal Software Process Stages | Estimated time (min) | Actual time (min) |
| --- | --- | --- | --- |
| Planning | Planning | 10 | 10 |
| · Estimate | Estimate how much time the task will take | 540 | 620 |
| Development | Development | 0 | 0 |
| · Analysis | Requirements analysis (including learning new technologies) | 120 | 60 |
| · Design Spec | Write the design document | 20 | 20 |
| · Design Review | Design review | 20 | 20 |
| · Coding Standard | Coding standard (set suitable conventions for the current development) | 0 | 0 |
| · Design | Detailed design | 20 | 20 |
| · Coding | Coding | 220 | 360 |
| · Code Review | Code review | 40 | 40 |
| · Test | Testing (self-test, fix code, commit changes) | 60 | 60 |
| Reporting | Reporting | 0 | 0 |
| · Test Report | Test report | 0 | 0 |
| · Size Measurement | Estimate the amount of work | 0 | 0 |
| · Postmortem & Process Improvement Plan | Postmortem and process improvement plan | 30 | 3 |

IV. Crawler Tool

The crawler was built with 八爪鱼 (Octoparse). You click on the pieces of information you want to scrape, the tool selects all elements of the same type, and you then assemble a workflow to run the crawl. When the crawl finishes, the results are exported.


V. Requirements Analysis

VI. Implementation

1. Main function
  • Parse every option on the command line and call the corresponding functions (a usage example follows the code below).
int main(int argc, char* argv[])
{
	char* a = NULL;   // value of -i (input file)
	char* b = NULL;   // value of -o (output file)
	char* c = NULL;   // value of -w (title weight)
	char* d = NULL;   // value of -n (top-n count)
	char* e = NULL;   // value of -m (phrase length)
	string cstr, dstr, estr;
	int i;
	int w = 0, m = 0, n = 0;
	for (i = 0; i < argc - 1; i++)   // argv[i + 1] is read below, so stop one argument early
	{
		if (strcmp(argv[i], "-i") == 0)  // -i option
		{
			a = argv[i + 1];
		}
		if (strcmp(argv[i], "-o") == 0)  // -o option
		{
			b = argv[i + 1];
		}
		if (strcmp(argv[i], "-w") == 0)  // -w option
		{
			c = argv[i + 1];
			cstr = c;
			w = atoi(cstr.c_str());
		}
		if (strcmp(argv[i], "-n") == 0)  // -n option
		{
			d = argv[i + 1];
			dstr = d;
			n = atoi(dstr.c_str());
		}
		if (strcmp(argv[i], "-m") == 0)  // -m option
		{
			e = argv[i + 1];
			estr = e;
			m = atoi(estr.c_str());
		}
	}
	readtxt(a);    // read the input file and count every character
	divide_n(w);   // count lines and words, adjust the character count to exclude "Title: ", "Abstract: " and the paper index, then split, store and sort the words
	if (m == 0)
	{
		writetxt_n(b, n);   // output format used when -m is absent
	}
	else
	{
		readtxt2(a);        // re-read the file without changing the character count, so it can be split again
		divide_m(w, m);     // split words, store those that form a valid phrase, and sort the phrases
		writetxt_m(b);      // output format used when -m is present
	}
	return 0;
}
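For reference, a typical invocation could look like the line below. The executable name WordCount.exe is only a placeholder for whatever the project actually builds: -i and -o give the input and output files, -w the weight applied to words under Title, -m the phrase length, and -n how many top entries to print (10 by default).

WordCount.exe -i input.txt -o output.txt -w 10 -m 3 -n 10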
2. Word frequency counting (divide_n, writetxt_n)

void divide_n(int w)
{
	size_t length;
	string wordstring;
	char wordchar[999];
	int w2 = 1;
	int i, j;
	int pos = 0;
	const char *sep = "./;'[] \\<>?:\"{}|`~!@#$%^&*()_+-=\n"; // delimiter characters
	char *p;
	char *buf;
	p = strtok_s(s, sep, &buf);
	while (p)
	{
		wordstring = p;
		strcpy_s(wordchar, wordstring.c_str());
		if (strcmp(wordchar, "Title") == 0)   // a "Title" header: adjust the character count, count the line, switch to weight w
		{
			charnum -= 11;
			linenum++;
			w2 = w;
		}
		else if (strcmp(wordchar, "Abstract") == 0)   // an "Abstract" header: adjust the character count, count the line, switch to weight 1
		{
			charnum -= 10;
			linenum++;
			w2 = 1;
		}
		else
		{
			length = wordstring.length();
			for (i = 0; i < (int)length; i++)   // convert to lowercase
			{
				if (wordchar[i] >= 'A' && wordchar[i] <= 'Z')
				{
					wordchar[i] = wordchar[i] + 32;
				}
			}
			wordstring = wordchar;
			if (wordstring.length() >= 4)
			{
				for (j = 0; j <= 3; j++)   // a valid word must start with at least four letters
				{
					if (wordchar[j] >= 'a' && wordchar[j] <= 'z')
						pos = 1;
					else
					{
						pos = 0;
						break;
					}
				}
			}
			if (pos == 1)
			{
				wordnum++;
				if (w_c.count(wordstring) == 0)   // new word: insert with weight w2; otherwise accumulate
				{
					w_c.insert(make_pair(wordstring, w2));
				}
				else
				{
					w_c[wordstring] += w2;
				}
				pos = 0;
			}
		}
		p = strtok_s(NULL, sep, &buf);
	}
	for (w_c_iter = w_c.begin(); w_c_iter != w_c.end(); w_c_iter++)   // copy the map into a vector so it can be sorted by frequency
	{
		w_c2.push_back(make_pair(w_c_iter->first, w_c_iter->second));
	}
	sort(w_c2.begin(), w_c2.end(), Comp);
}
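The comparator Comp passed to sort is not included in this excerpt. A minimal sketch of what it presumably does, assuming the usual ordering of weighted frequency descending with ties broken alphabetically, is:

// Hypothetical sketch only: Comp is not shown in this post, and this assumes the
// project's own "using namespace std;".
bool Comp(const pair<string, int> &x, const pair<string, int> &y)
{
	if (x.second != y.second)
		return x.second > y.second;   // higher weighted count first
	return x.first < y.first;         // tie-break: alphabetical order
}

writetxt_n below then formats the counting results for output.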
void writetxt_n(char *b, int n)
{
	char charnum_s[10], wordnum_s[10], linenum_s[10];
	char num_s[10];
	string res;
	char res_c[200000];
	_itoa_s(charnum + 2, charnum_s, 10);
	_itoa_s(wordnum, wordnum_s, 10);
	_itoa_s(linenum, linenum_s, 10);
	res = res + "characters: " + charnum_s + "\n";
	res = res + "words: " + wordnum_s + "\n";
	res = res + "lines: " + linenum_s + "\n";
	if (n == 0)   // -n not given: default to the top 10 words
	{
		n = 10;
	}
	if ((int)w_c2.size() >= n)   // print the top n words, or all of them if there are fewer than n
	{
		for (w_c2_iter = w_c2.begin(); w_c2_iter != w_c2.begin() + n; w_c2_iter++)
		{
			_itoa_s(w_c2_iter->second, num_s, 10);
			res = res + "<" + w_c2_iter->first + ">: " + num_s + "\n";
		}
	}
	else
	{
		for (w_c2_iter = w_c2.begin(); w_c2_iter != w_c2.end(); w_c2_iter++)
		{
			_itoa_s(w_c2_iter->second, num_s, 10);
			res = res + "<" + w_c2_iter->first + ">: " + num_s + "\n";
		}
	}
	strcpy_s(res_c, res.c_str());
	FILE *fp1;
	errno_t err;
	err = fopen_s(&fp1, b, "w");
	if (err == 0 && fp1 != NULL)
	{
		fwrite(res_c, res.length(), 1, fp1);
		fclose(fp1);   // flush and close the output file
	}
}
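For reference, the file written by this function has the shape below (the numbers are purely illustrative, not real results):

characters: 1234
words: 345
lines: 23
<learning>: 18
<network>: 15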
3. Phrase frequency counting (divide_m, writetxt_m)

void divide_m(int w, int m)
{
	size_t length;
	string cz;
	string wordstring;
	char wordchar[999];
	int w2 = 1;
	int i, j;
	int pos = 0;
	const char *sep = "./;'[] \\<>?:\"{}|`~!@#$%^&*()_+-=\n"; // delimiter characters
	char *p = NULL;
	char *buf;
	p = strtok_s(s, sep, &buf);
	while (p)
	{
		wordstring = p;
		strcpy_s(wordchar, wordstring.c_str());
		if (strcmp(wordchar, "Title") == 0)   // a "Title" header: switch to weight w and reset both queues
		{
			w2 = w;
			while (cz_q1.empty() == 0)
			{
				cz_q1.pop();
			}
			while (cz_q2.empty() == 0)
			{
				cz_q2.pop();
			}
		}
		else if (strcmp(wordchar, "Abstract") == 0)   // an "Abstract" header: switch to weight 1 and reset both queues
		{
			w2 = 1;
			while (cz_q1.empty() == 0)
			{
				cz_q1.pop();
			}
			while (cz_q2.empty() == 0)
			{
				cz_q2.pop();
			}
		}
		else
		{
			length = wordstring.length();
			for (i = 0; i < (int)length; i++)   // convert to lowercase
			{
				if (wordchar[i] >= 'A' && wordchar[i] <= 'Z')
				{
					wordchar[i] = wordchar[i] + 32;
				}
			}
			wordstring = wordchar;
			if (wordstring.length() >= 4)   // valid word: pos = 1; invalid word: pos = 0
			{
				for (j = 0; j <= 3; j++)   // a valid word must start with at least four letters
				{
					if (wordchar[j] >= 'a' && wordchar[j] <= 'z')
					{
						pos = 1;
					}
					else
					{
						pos = 0;
						break;
					}
				}
			}
			else
			{
				pos = 0;
			}

			if (pos == 1)
			{
				if (cz_q2.size() == 0)
				{
					cz = "";
				}
				cz_q1.push(wordstring);   // push the valid word into queue q1
				cz_q2.push(wordstring);   // push the valid word into queue q2
				if ((int)cz_q2.size() == m)   // m consecutive valid words form a phrase
				{
					cz_q1.pop();   // drop the first word of q1 so the window slides forward by one
					for (i = 1; i <= m; i++)   // join all words in q2 into cz for output, emptying q2
					{
						if (i == m)
						{
							cz = cz + cz_q2.front();
							cz_q2.pop();
						}
						else
						{
							cz = cz + cz_q2.front() + " ";
							cz_q2.pop();
						}
					}
					if (cz_c.count(cz) == 0)   // new phrase: insert with weight w2; otherwise accumulate
					{
						cz_c.insert(make_pair(cz, w2));
						cz = "";
					}
					else
					{
						cz_c[cz] += w2;
						cz = "";
					}
					for (j = 1; j <= (int)cz_q1.size(); j++)  // copy the words remaining in q1 into pop[] and push them back into q2
					{
						pop[j] = cz_q1.front();
						cz_q1.pop();
						cz_q1.push(pop[j]);
						cz_q2.push(pop[j]);
					}
				}
			}
			else   // an invalid word breaks the phrase, so clear both queues
			{
				while (cz_q1.empty() == 0)
				{
					cz_q1.pop();
				}
				while (cz_q2.empty() == 0)
				{
					cz_q2.pop();
				}
			}
		}
		p = strtok_s(NULL, sep, &buf);
	}
	for (cz_c_iter = cz_c.begin(); cz_c_iter != cz_c.end(); cz_c_iter++)   // copy the map into a vector so it can be sorted by frequency
	{
		cz_c2.push_back(make_pair(cz_c_iter->first, cz_c_iter->second));
	}
	sort(cz_c2.begin(), cz_c2.end(), Comp);
}
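The two-queue bookkeeping above is a bit intricate. The sketch below (a hypothetical simplification, not the code actually used in the project) shows the same sliding-window idea with a single std::deque:

#include <deque>
#include <map>
#include <string>

// Hypothetical helper illustrating the sliding-window phrase counting done by divide_m:
// keep the last m valid words; every time the window is full, record the phrase with
// weight w2 and slide the window forward by one word.
void count_phrase(std::deque<std::string> &window, std::map<std::string, int> &freq,
                  const std::string &word, int m, int w2)
{
	window.push_back(word);          // append the new valid word
	if ((int)window.size() < m)      // not enough words for a phrase yet
		return;
	std::string phrase;
	for (size_t i = 0; i < window.size(); i++)   // join the m words with single spaces
	{
		if (i > 0)
			phrase += " ";
		phrase += window[i];
	}
	freq[phrase] += w2;              // accumulate the weighted frequency
	window.pop_front();              // slide the window forward by one word
}

An invalid word or a section header would simply clear the window, just as the queues are cleared above. writetxt_m below then writes the top 10 phrases in the same format writetxt_n uses for words.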
void writetxt_m(char *b)
{
	char charnum_s[10], wordnum_s[10], linenum_s[10];
	char num_s[10];
	string res;
	char res_c[200000];
	_itoa_s(charnum + 2, charnum_s, 10);
	_itoa_s(wordnum, wordnum_s, 10);
	_itoa_s(linenum, linenum_s, 10);
	res = res + "characters: " + charnum_s + "\n";
	res = res + "words: " + wordnum_s + "\n";
	res = res + "lines: " + linenum_s + "\n";
	if (cz_c2.size() >= 10)   // print the top 10 phrases, or all of them if there are fewer than 10
	{
		for (cz_c2_iter = cz_c2.begin(); cz_c2_iter != cz_c2.begin() + 10; cz_c2_iter++)
		{
			_itoa_s(cz_c2_iter->second, num_s, 10);
			res = res + "<" + cz_c2_iter->first + ">: " + num_s + "\n";
		}
	}
	else
	{
		for (cz_c2_iter = cz_c2.begin(); cz_c2_iter != cz_c2.end(); cz_c2_iter++)
		{
			_itoa_s(cz_c2_iter->second, num_s, 10);
			res = res + "<" + cz_c2_iter->first + ">: " + num_s + "\n";
		}
	}
	strcpy_s(res_c, res.c_str());
	FILE *fp1;
	errno_t err;
	err = fopen_s(&fp1, b, "w");
	if (err == 0 && fp1 != NULL)
	{
		fwrite(res_c, res.length(), 1, fp1);
		fclose(fp1);   // flush and close the output file
	}
}

VII. Test Results

  • Input
  • Output

VIII. Performance Analysis

IX. Bonus Feature

  • Code
from pyecharts import WordCloud  # pre-1.0 pyecharts API

# Top words and their weighted frequencies scraped from the CVPR paper list
name = ["learning","with","image","from","network","that","deep","networks","this","video","visual","neural","detection","model","segmentation","multi"]
value = [2879,2744,2306,1826,1757,1757,1735,1510,1423,1088,1030,952,938,909,889,827]

wordcloud = WordCloud("CVPR热词图谱")                        # chart title
wordcloud.add("", name, value, word_size_range=[20, 100])   # font size scales with frequency
wordcloud.render()                                          # writes an interactive HTML page
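By default render() writes the chart to render.html in the current directory; passing a path, for example wordcloud.render('cvpr_wordcloud.html'), saves it under that name instead (the file name here is only an example).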

X. Teammate Evaluation

Really awesome.

XI. Learning Log

| Week | New code (lines) | Cumulative code (lines) | Study time this week (hours) | Cumulative study time (hours) | Key progress |
| --- | --- | --- | --- | --- | --- |
| 1 | 200 | 200 | 5 | 5 | Learned Axure |
| 5 | 200 | 400 | 12 | 17 | Learned HTML and CSS |
| 5 | 400 | 800 | 8 | 25 | Learned various C library functions |