Loading

系统级程序设计结课实验-第二部分

这一部分主要说怎么生成新的数据集

先贴出原数据集生成的一个c文件吧,不然都不知道长什么样子

#include <stdlib.h>           // Tag.OTHER
int main()                    // Tag.OTHER
{                             // Tag.OTHER
    int entity_3;             // Tag.BODY
    int entity_4;             // Tag.BODY
    entity_3 = 30;            // Tag.BODY
    entity_4 = 1;             // Tag.BODY
    char entity_7[96];        // Tag.BODY
    if(entity_3 < entity_4){  // Tag.BODY
    entity_3 = 56;            // Tag.BODY
    } else {                  // Tag.BODY
    entity_3 = 74;            // Tag.BODY
    }                         // Tag.BODY
    entity_7[entity_3] = '5'; // Tag.BUFWRITE_COND_SAFE
    int entity_6;             // Tag.BODY
    int entity_8;             // Tag.BODY
    char entity_2[24];        // Tag.BODY
    char entity_0[55];        // Tag.BODY
    entity_8 = 3;             // Tag.BODY
    entity_6 = 63;            // Tag.BODY
    entity_2[entity_8] = 'Q'; // Tag.BUFWRITE_TAUT_SAFE
    entity_0[entity_6] = 'P'; // Tag.BUFWRITE_TAUT_UNSAFE
    return 0;                 // Tag.BODY
}                             // Tag.OTHER

  

原数据集生成的原理

  先贴generate.py(主要是通过这个文件生成代码)的main函数部分代码

    # Pull the parsed command-line options into locals; seed and
    # num_instances are coerced to int here.
    outdir = args.outdir
    seed = int(args.seed)
    num_instances = int(args.num_instances)
    taut_only = args.taut_only
    linear_only = args.linear_only

    # Normalize the output directory and fail early if it does not exist
    # (it is never created here).
    outdir = os.path.abspath(os.path.expanduser(outdir))
    if not os.path.isdir(outdir):
        raise OSError("outdir does not exist: '{}'".format(outdir))

    # Seed the RNG for reproducible output; a seed of -1 skips seeding.
    if seed != -1:
        random.seed(seed)

    # Generators are used round-robin; linear_only replaces the whole list
    # with the single linear generator.
    generators = [gen_cond_example, gen_while_example, gen_for_example,
                  gen_fv_cond_example, gen_fv_while_example, gen_fv_for_example]
    if linear_only:
        generators = [gen_tautonly_linear_example]
    num_generators = len(generators)

    # Generate metadata only if the metadata_file argument is present
    # (the flag is not consumed in the part of main shown here).
    generate_metadata = args.metadata_file is not None
    # Maps each generated file name to the list of its per-line tag values;
    # also serves as the set of file names already produced.
    tag_metadata = {}
    inst_num = 0

    while inst_num < num_instances:
        # Pick the generator for this slot. inst_num only advances on
        # success, so a collision retries the same generator.
        gen = generators[inst_num % num_generators]
        if gen is gen_tautonly_linear_example:
            # The linear generator is called without arguments.
            instance_str, tags = gen()
        else:
            include_cond_bufwrite = not taut_only
            instance_str, tags = gen(
                include_cond_bufwrite=include_cond_bufwrite)

        # Name the file after a SHAKE-128 digest of its contents
        # (FNAME_HASHLEN digest bytes, hex-encoded).
        byte_obj = bytes(instance_str, 'utf-8')
        fname = hashlib.shake_128(byte_obj).hexdigest(FNAME_HASHLEN)
        fname = "{}.c".format(fname)
        if fname in tag_metadata:
            # Name already used by an earlier instance: generate again
            # without counting this attempt.
            continue

        # Record the tag values for this C file and count the instance.
        tag_metadata[fname] = [tag.value for tag in tags]
        inst_num += 1

        # Write the generated source into the output directory.
        path = os.path.join(outdir, fname)
        with open(path, 'w') as f:
            f.write(instance_str)

  稍加分析可以发现,生成测试集的逻辑是遍历generators数组,然后挨个执行里面的函数来生成c代码,现在我们的目标就很明确了,模仿gen_xxx_example函数使之生成我们想要的代码(其实还有别的方法,之后再讲),接着我们看看gen_cond_example函数是怎么写的

def gen_cond_example(include_cond_bufwrite=True):
    """Build one if/else example whose buffer write depends on the branch.

    The first three anonymous names become the buffer, index and threshold
    variables; the remaining names are handed on as dummy variable names.
    All numeric parameters are drawn from ``range(MAX_IDX)``.

    Args:
        include_cond_bufwrite (bool): forwarded unchanged to
            ``_assemble_general_example``

    Returns:
        instance_str (str): str of code example
        tags (list of Tag): tag for each line representing buffer safety
    """
    names = _get_anon_vars()
    buf_var, idx_var, thresh_var = names[:3]
    spare_names = names[3:]

    # The order of these draws is kept fixed so that a seeded run produces
    # the same sequence of examples.
    thresh = random.randrange(MAX_IDX)
    idx_init = random.randrange(MAX_IDX)
    buf_len = random.randrange(MAX_IDX)
    true_idx = random.randrange(MAX_IDX)
    false_idx = random.randrange(MAX_IDX)
    char = _get_char()

    # Replay the generated if/else: the branch taken decides which index
    # ends up in idx_var, and the write is safe iff it is below buf_len.
    written_idx = true_idx if idx_init < thresh else false_idx
    safe = written_idx < buf_len

    substitutions = dict(
        buf_var=buf_var,
        idx_var=idx_var,
        buf_len=buf_len,
        thresh=thresh,
        thresh_var=thresh_var,
        idx_init=idx_init,
        true_idx=true_idx,
        false_idx=false_idx,
        char=char,
    )

    return _assemble_general_example(
        templates.COND_DEC_INIT_PAIRS, templates.COND_MAIN_LINES,
        spare_names, safe, substitutions, include_cond_bufwrite)

  这个函数其实就是先通过_get_anon_vars获取十个随机的变量,指的是变量名是随机的,然后拿前三个分别作为buf_var, idx_var, thresh_var,后面几个暂时放在dummy_vars中,依样画葫芦就行,safe那行代码就是用来判断数组访问是否越界的,最后我们会看到其中用了templates这个模块,我们再去templates.py看看,大概是长这个样子

# Template lines for the generated C code. Names prefixed with `$` are
# placeholders; their values come from the `substitutions` dict built by
# each gen_*_example function.

# The buffer-write statement appended after the control-flow lines when a
# conditional buffer write is requested.
BUFWRITE_LINES = ["$buf_var[$idx_var] = '$char';"]

# templates for functions without free variables

# Each *_DEC_INIT_PAIRS entry is a (declaration, initialization) pair;
# None means the declaration has no separate initialization statement.
COND_DEC_INIT_PAIRS = [
    ("char $buf_var[$buf_len];", None),
    ("int $idx_var;", "$idx_var = $idx_init;"),
    ("int $thresh_var;", "$thresh_var = $thresh;")
]
# if/else that overwrites the index with one of two constants
COND_MAIN_LINES = [
    "if($idx_var < $thresh_var){",
    "$idx_var = $true_idx;",
    "} else {",
    "$idx_var = $false_idx;",
    "}"
]

WHILE_DEC_INIT_PAIRS = [
    ("char $buf_var[$buf_len];", None),
    ("int $idx_var;", "$idx_var = $idx_init;"),
    ("int $max_var;", "$max_var = $max_idx;")
]
# while loop that increments the index until it reaches $max_var
WHILE_MAIN_LINES = [
    "while($idx_var < $max_var){",
    "$idx_var++;",
    "}"
]

  其实就是个模板,所以我们需要给自己准备的错误类型也弄个模板,我就放个简单的上溢出的模板吧。

# Templates for the integer-addition (overflow) example.
# Two int operands, each declared and then assigned a constant.
ADD_DEC_INIT_PAIRS = [
    ("int $var1;", "$var1 = $var1_value;"),
    ("int $var2;", "$var2 = $var2_value;")
]
# The single statement under test: adds $var2 into $var1.
ADD_MAIN_LINES = [
    "$var1 = $var1 + $var2;"
]

  然后我们继续看_assemble_general_example函数,这个函数代码如下

def _assemble_general_example(dec_init_pairs, main_lines, dummy_vars,
                              safe, substitutions, include_cond_bufwrite):
    """Turn templates into a finished example string plus per-line tags.

    Args:
        dec_init_pairs (list of tuple): declaration/initialization
            statements, e.g. those in templates.py
        main_lines (list of str): the conditional or loop lines,
            e.g. those in templates.py
        dummy_vars (list of str): variable names still free for dummy use
        safe (bool): whether the conditional buffer write is safe; ignored
            (reset to None) when no conditional buffer write is included
        substitutions (dict): values to fill into the template placeholders
        include_cond_bufwrite (bool): whether to append the
            control flow-sensitive buffer write after main_lines

    Returns:
        instance_str (str): str of code example
        tags (list of Tag): tag for each line representing buffer safety

    Ensures:
        len(instance_str.split("\n")) == len(tags)
    """
    if not include_cond_bufwrite:
        # no query line is emitted, so there is no verdict to carry along
        safe = None
    else:
        # concatenation yields a new list, so the shared template list in
        # templates.py is never mutated
        main_lines = main_lines + templates.BUFWRITE_LINES

    body_lines, body_tags = _get_lines(
        dec_init_pairs, main_lines, dummy_vars, safe, include_cond_bufwrite)
    tags = _get_tags(body_tags)
    instance_str = _get_instance_str(
        body_lines, substitutions, templates.FUNC_TMPL_STR, tags)
    return instance_str, tags

  从中间的if语句开始是正文,就是根据include_cond_bufwrite决定是不是要在main_lines后面加一句访问数组的语句,include_cond_bufwrite是指有没有条件、循环等控制流存在,如果有就在控制流结束的地方添加数组访问语句。_get_instance_str是用实际的值代替模板的相应变量,就是填值的函数,先看看_get_lines函数

def _get_lines(dec_init_pairs, main_lines, dummy_vars, safe,
               include_cond_bufwrite):
    """Build the body lines and their tags, then mix in dummy statements.

    Args:
        dec_init_pairs (list of tuple): declaration/initialization pairs
        main_lines (list of str): lines that use the declared vars
        dummy_vars (list of str): variable names available for dummy use
        safe (bool): whether the query line access is safe (for tags)
            or None, if no conditional query line should be added
        include_cond_bufwrite (bool): whether the last main line is the
            control flow-sensitive buffer write

    Returns:
        lines (list of str)
        body_tags (list of Tag instances): tags for each body line
    """
    setup_lines = _get_setup_lines(dec_init_pairs)

    # Tags are laid out before any dummy is inserted: everything is BODY,
    # except that the final line is the query line when one is present.
    body_tags = [Tag.BODY] * len(setup_lines + main_lines)
    if include_cond_bufwrite:
        if safe:
            body_tags[-1] = Tag.BUFWRITE_COND_SAFE
        else:
            body_tags[-1] = Tag.BUFWRITE_COND_UNSAFE

    # Without a conditional query line a larger minimum number of dummies
    # is used.
    if include_cond_bufwrite:
        fewest_dummies = 0
    else:
        fewest_dummies = MIN_NUM_DUMMIES_TAUTONLY
    dummy_count = random.randrange(fewest_dummies, MAX_NUM_DUMMIES + 1)

    all_lines, all_tags = _insert_dummies(
        setup_lines, main_lines, dummy_vars, dummy_count, body_tags,
        include_cond_bufwrite)
    return all_lines, all_tags

  正文从setup_lines开始,比较重要的是 if include_cond_bufwrite这一条件语句,指的是如果有控制流,那么query_tag的位置应该在body_tags的最后,因为如果有控制流的话整个文件先是生成控制流对应的代码,然后随机再生成一些代码,此时就在这两个过程之间。还没有生成后面的随机代码,所以query_tag一定要放在body_tags的最后。num_dummies是随机决定的、要插入的无关(dummy)变量的个数,而dummy_vars才是剩余可用的变量名:程序一开始设置了只有10个还是多少个变量可以使用,前面生成控制流代码已经用去了一部分,剩下的用来生成控制无关(结果不依赖于控制流)的代码。很明显我们需要在这个 if include_cond_bufwrite后面加上我们新加的tag,我们看看sa_tag.py文件(我就不贴出来了,内容很简单),会发现其实这里面就定义了一些tag的常量,我们加上自己的

    # Tags for the integer-addition statement: SAFE when the sum stays
    # within the int range, UNSAFE when it overflows.
    MATH_OVERFLOW_SAFE = 6
    MATH_OVERFLOW_UNSAFE = 7

  ,然后回到_get_lines修改if语句

    # The arithmetic example takes priority: the last pre-dummy line gets
    # the overflow verdict instead of a buffer-write tag.
    if include_math_overflow:
        query_tag = Tag.MATH_OVERFLOW_SAFE if safe else Tag.MATH_OVERFLOW_UNSAFE
        body_tags[-1] = query_tag
    # Otherwise keep the original tagging of the conditional buffer write.
    elif include_cond_bufwrite:
        query_tag = Tag.BUFWRITE_COND_SAFE if safe else Tag.BUFWRITE_COND_UNSAFE
        body_tags[-1] = query_tag

  include_math_overflow是作为参数传递过来的,默认为False,只有调用自己写的gen_xxx_example才会设置为True

类似的,我们看看_get_lines最后调用的_insert_dummies函数

def _insert_dummies(setup_lines, main_lines, dummy_vars, num_dummies,
                    body_tags, include_cond_bufwrite):
    """Insert ``num_dummies`` dummy statements around the control flow.

    Each insertion is delegated to ``_insert_referential_dummy``, which
    receives and returns the running state (lines, remaining names, tags
    and the control-flow bounds).

    Args:
        setup_lines (list of str): declaration and initialization lines
        main_lines (list of str): control flow lines
        dummy_vars (list of str): variable names available for dummy use
        num_dummies (int): number of dummy vars to insert
        body_tags (list of Tag instances): tags before adding dummies
        include_cond_bufwrite (bool): whether the last main line is the
            control flow-sensitive buffer write

    Returns:
        lines (list of str): with dummy lines added
        body_tags (list of Tag instances): with tags added for dummy lines
    """
    lines = setup_lines + main_lines

    # Control flow occupies [control_flow_start, control_flow_end). When a
    # conditional buffer write was appended, it is the final line and is
    # not part of the control flow itself.
    control_flow_start = len(setup_lines)
    if include_cond_bufwrite:
        control_flow_end = len(lines) - 1
    else:
        control_flow_end = len(lines)

    for _ in range(num_dummies):
        state = _insert_referential_dummy(
            lines, dummy_vars, body_tags, control_flow_start,
            control_flow_end)
        (lines, dummy_vars, body_tags,
         control_flow_start, control_flow_end) = state

    return lines, body_tags

  我们只关心和include_cond_bufwrite相关的地方,就是 

    if include_cond_bufwrite :
        control_flow_end -= 1

  这一句,这句的意思是如果存在控制流,控制流结束的位置减1。上面说过如果存在控制流的话会在代码最后(生成控制无关代码之前)添加数组访问语句,此时control_flow_end 默认标识的位置为代码末尾,因此要做修正,我设计的上溢出与控制流无关,因此不需要修改,所以后面加上and not include_math_overflow就可以了,同样地include_math_overflow也是作为参数传递过来,默认False。

  最后我们终于要添加gen_xxx_example函数了,我自己模仿写的代码是这样的

def gen_add_overflow_example(include_cond_bufwrite=True):
    """Generate an integer-addition example that may overflow.

    Two int variables are assigned constants and then added
    (``templates.ADD_MAIN_LINES``). The addition is tagged safe when the
    mathematical sum does not exceed ``int_max``.

    Args:
        include_cond_bufwrite (bool): forwarded unchanged to
            ``_assemble_general_example``

    Returns:
        instance_str (str): str of code example
        tags (list of Tag): tag for each line representing safety
    """
    # INT_MAX of a 32-bit signed int; assumes the generated C code is
    # built for a platform where int is 32 bits wide.
    int_max = 2147483647

    anon_vars = _get_anon_vars()
    var1, var2 = anon_vars[:2]
    dummy_vars = anon_vars[2:]

    # Each operand is int_max - int_max // k with k drawn from 1..4, i.e.
    # one of 0, 1073741824, 1431655765 or 1610612736. Both are
    # non-negative, so only overflow past int_max has to be checked.
    var1_value = int_max - int_max // random.randrange(1, 5)
    var2_value = int_max - int_max // random.randrange(1, 5)
    substitutions = {
        'var1': var1,
        'var2': var2,
        'var1_value': var1_value,
        'var2_value': var2_value,
    }

    # Python ints do not wrap, so the exact sum can be compared directly.
    safe = var1_value + var2_value <= int_max

    # The trailing True is the extra math-overflow flag that the modified
    # _assemble_general_example accepts as its seventh positional
    # parameter (NOTE(review): that modified signature is not shown here;
    # confirm before switching to a keyword argument).
    return _assemble_general_example(
        templates.ADD_DEC_INIT_PAIRS, templates.ADD_MAIN_LINES, dummy_vars,
        safe, substitutions, include_cond_bufwrite, True)

  逻辑很简单,就不解释了,弄完之后记得在generators加上这个函数然后注释掉test(),不然没法生成代码。(如果发现生成的数量很少可以往generators多加几个这个函数或者只用这个函数)。下面是我生成的一个代码

#include <stdlib.h>                 // Tag.OTHER
#include <stdio.h>                  // Tag.OTHER
int main()                          // Tag.OTHER
{                                   // Tag.OTHER
    int entity_8;                   // Tag.BODY
    int entity_5;                   // Tag.BODY
    entity_8 = 0;                   // Tag.BODY
    entity_5 = 1610612735;          // Tag.BODY
    int entity_2;                   // Tag.BODY
    entity_2 = 60;                  // Tag.BODY
    entity_5 = entity_5 + entity_8; // Tag.MATH_OVERFLOW_SAFE
    char entity_0[51];              // Tag.BODY
    entity_0[entity_2] = 'n';       // Tag.BUFWRITE_TAUT_UNSAFE
    return 0;                       // Tag.BODY
}                                   // Tag.OTHER

  

posted @ 2018-12-24 22:54  velor2012  阅读(174)  评论(1)    收藏  举报