系统级程序设计结课实验-第二部分
这一部分主要说怎么生成新的数据集
先贴出原数据集生成的一个c文件吧,不然都不知道长什么样子
#include <stdlib.h> // Tag.OTHER
/* Sample file from the original data set; the trailing comment on each line is that line's tag. */
int main() // Tag.OTHER
{ // Tag.OTHER
int entity_3; // Tag.BODY
int entity_4; // Tag.BODY
entity_3 = 30; // Tag.BODY
entity_4 = 1; // Tag.BODY
char entity_7[96]; // Tag.BODY
/* 30 < 1 is false, so the else branch runs and entity_3 ends up as 74. */
if(entity_3 < entity_4){ // Tag.BODY
entity_3 = 56; // Tag.BODY
} else { // Tag.BODY
entity_3 = 74; // Tag.BODY
} // Tag.BODY
/* Index 74 lies inside the 96-element buffer, hence the SAFE tag. */
entity_7[entity_3] = '5'; // Tag.BUFWRITE_COND_SAFE
int entity_6; // Tag.BODY
int entity_8; // Tag.BODY
char entity_2[24]; // Tag.BODY
char entity_0[55]; // Tag.BODY
entity_8 = 3; // Tag.BODY
entity_6 = 63; // Tag.BODY
/* Index 3 lies inside the 24-element buffer. */
entity_2[entity_8] = 'Q'; // Tag.BUFWRITE_TAUT_SAFE
/* Index 63 is past the end of the 55-element buffer. */
entity_0[entity_6] = 'P'; // Tag.BUFWRITE_TAUT_UNSAFE
return 0; // Tag.BODY
} // Tag.OTHER
原数据集生成的原理
先贴generate.py(主要是通过这个文件生成代码)的main函数部分代码
outdir = args.outdir
seed = int(args.seed)
num_instances = int(args.num_instances)
taut_only = args.taut_only
linear_only = args.linear_only
# check paths
outdir = os.path.abspath(os.path.expanduser(outdir))
if not os.path.isdir(outdir):
raise OSError("outdir does not exist: '{}'".format(outdir))
# set seed
if seed != -1:
random.seed(seed)
generators = [gen_cond_example, gen_while_example, gen_for_example,
gen_fv_cond_example, gen_fv_while_example, gen_fv_for_example]
if linear_only:
generators = [gen_tautonly_linear_example]
num_generators = len(generators)
# Generate metadata only if the metadata_file argument is present
generate_metadata = args.metadata_file is not None
# This dict is used to store instance metadata
tag_metadata = {}
inst_num = 0
while inst_num < num_instances:
# generate example
gen = generators[inst_num % num_generators]
if gen is gen_tautonly_linear_example:
instance_str, tags = gen()
else:
include_cond_bufwrite = not taut_only
instance_str, tags = gen(
include_cond_bufwrite=include_cond_bufwrite)
# generate filename
byte_obj = bytes(instance_str, 'utf-8')
fname = hashlib.shake_128(byte_obj).hexdigest(FNAME_HASHLEN)
fname = "{}.c".format(fname)
if fname in tag_metadata:
# Collision, try again
continue
# insert record into metadata for this c file
tag_metadata[fname] = [tag.value for tag in tags]
inst_num += 1
# write to file
path = os.path.join(outdir, fname)
with open(path, 'w') as f:
f.write(instance_str)
稍加分析可以发现,生成测试集的逻辑是遍历generators数组,然后挨个执行里面的函数来生成c代码,现在我们的目标就很明确了,模仿gen_xxx_example函数使之生成我们想要的代码(其实还有别的方法,之后再讲),接着我们看看gen_cond_example函数是怎么写的
def gen_cond_example(include_cond_bufwrite=True):
    """Build one if/else example whose buffer index depends on the branch.

    Args:
        include_cond_bufwrite (bool): passed through unchanged to
            _assemble_general_example

    Returns:
        instance_str (str): str of code example
        tags (list of Tag): tag for each line representing buffer safety
    """
    names = _get_anon_vars()
    buf_var, idx_var, thresh_var = names[:3]
    spare_names = names[3:]

    # The draws happen in this order; reordering them would change the
    # output obtained for a fixed seed.
    thresh = random.randrange(MAX_IDX)
    idx_init = random.randrange(MAX_IDX)
    buf_len = random.randrange(MAX_IDX)
    true_idx = random.randrange(MAX_IDX)
    false_idx = random.randrange(MAX_IDX)
    char = _get_char()

    substitutions = dict(
        buf_var=buf_var,
        idx_var=idx_var,
        buf_len=buf_len,
        thresh=thresh,
        thresh_var=thresh_var,
        idx_init=idx_init,
        true_idx=true_idx,
        false_idx=false_idx,
        char=char,
    )

    # Mirror the generated if/else: the index that ends up being used is
    # true_idx when idx_init < thresh and false_idx otherwise.
    final_idx = true_idx if idx_init < thresh else false_idx
    safe = final_idx < buf_len

    return _assemble_general_example(
        templates.COND_DEC_INIT_PAIRS,
        templates.COND_MAIN_LINES,
        spare_names,
        safe,
        substitutions,
        include_cond_bufwrite,
    )
这个函数其实就是先通过_get_anon_vars获取十个随机的变量(指的是变量名是随机的),然后拿前三个分别作为buf_var、idx_var、thresh_var,后面几个暂时放在dummy_vars中,依样画葫芦就行。safe那行代码就是用来判断数组访问是否越界的。最后我们会看到其中用了templates这个模块,我们再去templates.py看看,大概是长这个样子
# Line that writes one character into the buffer at the index variable.
BUFWRITE_LINES = [
    "$buf_var[$idx_var] = '$char';",
]

# Templates for functions without free variables.
# Each entry is a (declaration, initialization) pair; the buffer
# declaration carries None in place of an initialization.
COND_DEC_INIT_PAIRS = [
    ("char $buf_var[$buf_len];", None),
    ("int $idx_var;", "$idx_var = $idx_init;"),
    ("int $thresh_var;", "$thresh_var = $thresh;"),
]

# if/else that overwrites the index variable in both branches.
COND_MAIN_LINES = [
    "if($idx_var < $thresh_var){",
    "$idx_var = $true_idx;",
    "} else {",
    "$idx_var = $false_idx;",
    "}",
]

WHILE_DEC_INIT_PAIRS = [
    ("char $buf_var[$buf_len];", None),
    ("int $idx_var;", "$idx_var = $idx_init;"),
    ("int $max_var;", "$max_var = $max_idx;"),
]

# Loop that increments the index variable until it reaches the maximum.
WHILE_MAIN_LINES = [
    "while($idx_var < $max_var){",
    "$idx_var++;",
    "}",
]
其实就是个模板,所以我们需要给自己准备的错误类型也弄个模板,我就放个简单的上溢出的模板吧。
# Templates for the integer-addition example: two int variables, each
# given as a (declaration, initialization) pair.
ADD_DEC_INIT_PAIRS = [
    ("int $var1;", "$var1 = $var1_value;"),
    ("int $var2;", "$var2 = $var2_value;"),
]

# The single statement under test: the sum is stored back into $var1.
ADD_MAIN_LINES = [
    "$var1 = $var1 + $var2;",
]
然后我们继续看_assemble_general_example函数,这个函数代码如下
def _assemble_general_example(dec_init_pairs, main_lines, dummy_vars,
                              safe, substitutions, include_cond_bufwrite):
    """Turn templates into a finished example string plus per-line tags.

    Args:
        dec_init_pairs (list of tuple): declaration/initialization statements,
            e.g. those in templates.py
        main_lines (list of str): lines with the conditional or loop,
            e.g. those in templates.py
        dummy_vars (list of str): variable names available for use
        safe (bool): whether the conditional buffer write is safe
        substitutions (dict): names to substitute into templates
        include_cond_bufwrite (bool): whether to include the
            control flow-sensitive buffer write

    Returns:
        instance_str (str): str of code example
        tags (list of Tag): tag for each line representing buffer safety

    Ensures:
        len(instance_str.split("\n")) == len(tags)
    """
    if not include_cond_bufwrite:
        # No query line is emitted, so there is no verdict to pass on.
        safe = None
    else:
        # Concatenation yields a new list, so the shared template list
        # is never modified.
        main_lines = main_lines + templates.BUFWRITE_LINES

    body_lines, body_tags = _get_lines(
        dec_init_pairs, main_lines, dummy_vars, safe, include_cond_bufwrite)
    tags = _get_tags(body_tags)
    instance_str = _get_instance_str(
        body_lines, substitutions, templates.FUNC_TMPL_STR, tags)
    return instance_str, tags
从中间的if语句开始是正文,就是根据include_cond_bufwrite决定是不是要在main_lines后面加一句访问数组的语句。include_cond_bufwrite是指有没有条件、循环这样的控制流存在,如果有就在控制流结束的地方添加数组访问语句。_get_instance_str是用实际的值代替模板中的相应变量,就是填值的函数。先看看_get_lines函数
def _get_lines(dec_init_pairs, main_lines, dummy_vars, safe,
               include_cond_bufwrite):
    """Produce the body lines (setup, main, dummies) and their tags.

    Args:
        dec_init_pairs (list of tuple)
        main_lines (list of str): lines that use the declared vars
        dummy_vars (list of str): variable names available for dummy use
        safe (bool): whether the query line access is safe (for tags)
            or None, if no conditional query line should be added
        include_cond_bufwrite (bool): whether to include the
            control flow-sensitive buffer write

    Returns:
        lines (list of str)
        body_tags (list of Tag instances): tags for each body line
    """
    setup_lines = _get_setup_lines(dec_init_pairs)

    # Tags are laid out before any dummy exists: one BODY tag per
    # setup/main line.
    body_tags = [Tag.BODY] * len(setup_lines + main_lines)

    if include_cond_bufwrite:
        # The query line is the last main line, so it owns the last tag.
        if safe:
            body_tags[-1] = Tag.BUFWRITE_COND_SAFE
        else:
            body_tags[-1] = Tag.BUFWRITE_COND_UNSAFE
        fewest_dummies = 0
    else:
        fewest_dummies = MIN_NUM_DUMMIES_TAUTONLY

    num_dummies = random.randrange(fewest_dummies, MAX_NUM_DUMMIES + 1)
    all_lines, all_tags = _insert_dummies(
        setup_lines, main_lines, dummy_vars, num_dummies, body_tags,
        include_cond_bufwrite)
    return all_lines, all_tags
正文从setup_lines开始,比较重要的是 if include_cond_bufwrite这一条件语句,指的是如果有控制流,那么query_tag的位置应该在body_tags的最后,因为如果有控制流的话整个文件先是生成控制流对应的代码,然后随机再生成一些代码,此时就在这两个过程之间。还没有生成后面的随机代码,所以query_tag一定要放在body_tags的最后。num_dummies是接下来要插入的dummy(与控制流无关的语句)的数量,是在min_num_dummies和MAX_NUM_DUMMIES之间随机取的;dummy_vars才是剩余可用的变量名——程序一开始设置了只有10个还是多少个变量可以使用,前面生成控制流代码已经用去了一部分,剩下的用来生成控制无关(结果不依赖于控制流)的代码。很明显我们需要在这个 if include_cond_bufwrite后面加上我们新加的tag,我们看看sa_tag.py文件(我就不贴出来了,内容很简单),会发现其实这里面就定义了一些tag的常量,我们加上自己的
# Tags for the integer-addition query line: SAFE when the sum fits in
# 2147483647, UNSAFE when it does not.
MATH_OVERFLOW_SAFE = 6
MATH_OVERFLOW_UNSAFE = 7
,然后回到_get_lines修改if语句
if include_math_overflow:
query_tag = Tag.MATH_OVERFLOW_SAFE if safe else Tag.MATH_OVERFLOW_UNSAFE
body_tags[-1] = query_tag
elif include_cond_bufwrite:
query_tag = Tag.BUFWRITE_COND_SAFE if safe else Tag.BUFWRITE_COND_UNSAFE
body_tags[-1] = query_tag
include_math_overflow是作为参数传递过来的,默认为False,只有调用自己写的gen_xxx_example才会设置为True
类似的,我们看看_get_lines最后调用的_insert_dummies函数
def _insert_dummies(setup_lines, main_lines, dummy_vars, num_dummies,
                    body_tags, include_cond_bufwrite
                    ):
    """Add dummy array declare/set pairs (all safe sets) to the body.

    Args:
        setup_lines (list of str): declaration and initialization lines
        main_lines (list of str): control flow lines
        dummy_vars (list of str): variable names available for dummy use
        num_dummies (int): number of dummy vars to insert
        body_tags (list of Tag instances): tags before adding dummies
        include_cond_bufwrite (bool): whether to include the
            control flow-sensitive buffer write

    Returns:
        lines (list of str): with dummy dec/set pairs added
        body_tags (list of Tag instances): with tags added for dummy lines
    """
    lines = setup_lines + main_lines

    # Control flow occupies [start, end): it begins right after the setup
    # lines and runs to the end of the body, minus the trailing buffer
    # write when one was appended.
    start = len(setup_lines)
    end = len(lines) - 1 if include_cond_bufwrite else len(lines)

    # Thread the whole state through the helper once per dummy.
    state = (lines, dummy_vars, body_tags, start, end)
    for _ in range(num_dummies):
        state = _insert_referential_dummy(*state)

    lines, _, body_tags, _, _ = state
    return lines, body_tags
我们只关心和include_cond_bufwrite相关的地方,就是
if include_cond_bufwrite :
control_flow_end -= 1
这一句,这句的意思是如果存在控制流,控制流结束的位置减1。上面说过如果存在控制流的话会在代码最后(生成控制无关代码之前)添加数组访问语句,此时control_flow_end默认标识的位置为代码末尾,因此要做修正。我设计的上溢出与控制流无关,因此不需要这个修正,所以在条件后面加上and not include_math_overflow就可以了,同样地include_math_overflow也是作为参数传递过来,默认False。
最后我们终于要添加gen_xxx_example函数了,我自己模仿写的代码是这样的
def gen_add_overflow_example(include_cond_bufwrite=True):
    """Generate an integer-addition example that may exceed 2147483647.

    Two int variables are initialized and then added; the addition line
    is the one whose safety is being labelled.

    Args:
        include_cond_bufwrite (bool): passed through unchanged to
            _assemble_general_example

    Returns:
        instance_str (str): str of code example
        tags (list of Tag): tag for each line of the example
    """
    # 2**31 - 1, i.e. INT_MAX where the C int is 32 bits wide.
    int_max = 2147483647

    anon_vars = _get_anon_vars()
    var1, var2 = anon_vars[:2]
    dummy_vars = anon_vars[2:]

    # Each operand is int_max - int_max // k with k drawn from 1..4, so it
    # is either 0 (k == 1) or larger than int_max / 2.
    var1_value = int_max - int_max // random.randrange(1, 5)
    var2_value = int_max - int_max // random.randrange(1, 5)

    substitutions = {
        'var1': var1,
        'var2': var2,
        'var1_value': var1_value,
        'var2_value': var2_value,
    }

    # Python ints do not wrap, so the true sum can be compared directly.
    safe = var1_value + var2_value <= int_max

    # NOTE(review): the trailing True is presumably the include_math_overflow
    # flag of the modified _assemble_general_example -- confirm against its
    # signature.
    return _assemble_general_example(templates.ADD_DEC_INIT_PAIRS,
                                     templates.ADD_MAIN_LINES, dummy_vars,
                                     safe, substitutions,
                                     include_cond_bufwrite, True)
逻辑很简单,就不解释了,弄完之后记得在generators加上这个函数然后注释掉test(),不然没法生成代码。(如果发现生成的数量很少可以往generators多加几个这个函数或者只用这个函数)。下面是我生成的一个代码
#include <stdlib.h> // Tag.OTHER
#include <stdio.h> // Tag.OTHER
/* Sample file produced after adding the addition generator; the trailing comment on each line is that line's tag. */
int main() // Tag.OTHER
{ // Tag.OTHER
int entity_8; // Tag.BODY
int entity_5; // Tag.BODY
entity_8 = 0; // Tag.BODY
entity_5 = 1610612735; // Tag.BODY
int entity_2; // Tag.BODY
entity_2 = 60; // Tag.BODY
/* 1610612735 + 0 does not exceed 2147483647, hence the SAFE tag. */
entity_5 = entity_5 + entity_8; // Tag.MATH_OVERFLOW_SAFE
char entity_0[51]; // Tag.BODY
/* Index 60 is past the end of the 51-element buffer. */
entity_0[entity_2] = 'n'; // Tag.BUFWRITE_TAUT_UNSAFE
return 0; // Tag.BODY
} // Tag.OTHER

浙公网安备 33010602011771号