# 蓄水池抽样及实现

。。。

Init : a reservoir with the size： k
for    i= k+1 to N
M=random(1, i);
if( M <= k)
SWAP the Mth value and ith value
end for


每次都是以 k/i 的概率来选择

1.当i=k+1的时候，蓄水池的容量为k，第k+1个元素被选择的概率明显为k/(k+1), 此时前k个元素出现在蓄水池的概率为 k/(k+1), 很明显结论成立。
2.假设当 j=i 的时候结论成立，此时以 k/i 的概率来选择第i个元素，前i-1个元素出现在蓄水池的概率都为k/i。

①.由2知道在第i+1次选择前，任一前i个元素出现在蓄水池的概率都为k/i
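②.考虑被替换的概率：第i+1个元素以 k/(i+1) 的概率被选中，被选中后等概率地替换蓄水池中k个元素之一，因此蓄水池中某个特定元素被替换的概率为 k/(i+1) × 1/k = 1/(i+1)，不被替换的概率为 1 − 1/(i+1) = i/(i+1)。
③.综合①②，前i个元素中任一元素在第i+1次选择之后仍留在蓄水池中的概率为 k/i × i/(i+1) = k/(i+1)，而第i+1个元素进入蓄水池的概率也是 k/(i+1)，因此当 j=i+1 时结论同样成立。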

RandomSelect.h

#include<stdio.h>
#include<windows.h>
#include<vector>
#include<time.h>
#include<math.h>

using namespace std;

// Capacity, in chars, intended for one line of the sample file (0x7fff == 32767).
#define MAX_LENGTH 0x7fff

/*
Scans the sample file at input_path, where each line starts with an integer label
(0 = negative sample, anything else = positive sample), and reservoir-samples up to
select_num line indices per class into neg_select / pos_select.
Both arrays must have room for select_num entries.
The chosen indices are also written to "pos_index.txt" and "neg_index.txt".
*/
void reservoirSampling(long select_num,long* pos_select,long* neg_select,char* input_path);
/*
Copies to ouput_path every line of input_path whose zero-based line number is
listed in pos_select or neg_select (each array holds select_num entries).
*/
void saveFile(long select_num,long* pos_select,long* neg_select,char* input_path,char* ouput_path);
/* Prints a description of the three command-line arguments. */
void usage();

RandomSelect.cpp

/*
Author: SongQi
Create Time:2012/9/19

Function: A program which randomly selects samples from the input sample file.
It can be used to separate training samples and testing samples. For now it can only select two-class samples.
Input:
1.input sample file path 2.output sample file path 3.number of samples to select for each of the positive and negative classes.

Output:
1.output sample file
*/

#include "stdafx.h"
#include "RandomSelect.h"
#include <algorithm>
#include <fstream>
#include <string>
#include <vector>

using namespace std;

/*
Entry point.
argv[1]: input sample file path
argv[2]: output sample file path
argv[3]: number of samples to select for each class (must be > 0)

Prints the usage text and stops when the argument count is wrong or the
requested sample number is not a positive integer.
*/
int main(int argc, char** argv)
{
    if (argc != 4)
    {
        usage();
        system("pause");
        return 0;
    }

    //input sample file path
    char* input_path = argv[1];
    //output sample file path
    char* ouput_path = argv[2];
    //selected sample number from the input file
    const long select_num = atol(argv[3]);

    // atol yields 0 for non-numeric text; a zero or negative count cannot be
    // used as an array size, so reject it before allocating anything.
    if (select_num <= 0)
    {
        printf("the sample number must be a positive integer!\n");
        usage();
        system("pause");
        return 0;
    }

    // The vectors own the index buffers (no leak on any exit path). Every slot
    // starts at -1, which is never a valid line index, so slots the sampler
    // leaves untouched cannot match a line when the result is saved.
    std::vector<long> pos_select(static_cast<std::vector<long>::size_type>(select_num), -1L);
    std::vector<long> neg_select(static_cast<std::vector<long>::size_type>(select_num), -1L);

    //random select samples from the input file
    reservoirSampling(select_num, &pos_select[0], &neg_select[0], input_path);
    saveFile(select_num, &pos_select[0], &neg_select[0], input_path, ouput_path);

    system("pause");
    return 0;
}

/*
Returns a pseudo-random value in [0, bound). bound must be > 0.
rand() is only guaranteed to provide 15 random bits (RAND_MAX >= 32767), which
is too few once more than 32767 samples of a class have been seen, so two draws
are combined into a 30-bit value before reducing it.
*/
static long randomBelow(long bound)
{
    const long high = static_cast<long>(rand() & 0x7fff);
    const long low = static_cast<long>(rand() & 0x7fff);
    return ((high << 15) | low) % bound;
}

/*
Offers one sample to a reservoir of capacity select_num.
seen is the number of samples of this class offered before this one.
The first select_num samples fill the reservoir in order; afterwards the sample
is kept with probability select_num/(seen+1) and overwrites a random slot.
*/
static void offerSample(long* reservoir, long select_num, long seen, long index)
{
    if (seen < select_num)
    {
        reservoir[seen] = index;
        return;
    }
    if (select_num <= 0)
        return;    //an empty reservoir can never keep anything
    if (randomBelow(seen + 1) < select_num)
        reservoir[randomBelow(select_num)] = index;
}

/* Consumes characters up to and including the next '\n', or up to end of file. */
static void skipRestOfLine(FILE* file)
{
    int ch;
    do
    {
        ch = fgetc(file);
    } while (ch != '\n' && ch != EOF);
}

/* Writes count indices, one per line, to the text file at path. */
static void writeIndexFile(const char* path, const long* indices, long count)
{
    FILE* out_file = fopen(path, "w");
    if (out_file == NULL)
    {
        printf("can not open %s for writing!\n", path);
        return;
    }
    for (long i = 0; i < count; i++)
        fprintf(out_file, "%ld\n", indices[i]);
    fclose(out_file);
}

/*
Reservoir-samples line indices from the sample file at input_path.

Each record is one line that begins with an integer label: 0 marks a negative
sample, any other value a positive one. Up to select_num indices per class are
stored in neg_select / pos_select (both must have room for select_num entries).
Slots that could not be filled, because the file holds fewer samples of that
class or could not be opened, are set to -1.

The number of records read is printed, and the chosen indices are written to
"pos_index.txt" and "neg_index.txt".

NOTE(review): records are counted by label, so blank lines are not counted here;
code that maps the indices back to physical line numbers assumes the file has
no blank lines.
*/
void reservoirSampling(long select_num,long* pos_select,long* neg_select,char* input_path)
{
    // -1 is never a valid line index, so unfilled slots stay recognisable.
    for (long slot = 0; slot < select_num; slot++)
    {
        pos_select[slot] = -1;
        neg_select[slot] = -1;
    }

    FILE *input_file = fopen(input_path, "r");
    if (input_file == NULL)
    {
        printf("the input sample file does not exist!\n");
        usage();
        system("pause");
        return;
    }

    srand( static_cast<unsigned int>(time(NULL)) );

    int label = 0;
    long index = 0;
    long pos_count = 0;
    long neg_count = 0;
    // "%d" without a trailing blank: a trailing blank would also swallow the
    // newline of a label-only line, and the skip below would then discard the
    // following record.
    while (fscanf(input_file, "%d", &label) == 1)
    {
        if (label == 0)
        {
            offerSample(neg_select, select_num, neg_count, index);
            neg_count++;
        }
        else
        {
            offerSample(pos_select, select_num, pos_count, index);
            pos_count++;
        }
        index++;
        //the rest of the line holds the features; only the label matters here
        skipRestOfLine(input_file);
    }
    fclose(input_file);

    printf("%ld\n", index);

    // Only the slots that were actually filled are reported.
    const long pos_kept = pos_count < select_num ? pos_count : select_num;
    const long neg_kept = neg_count < select_num ? neg_count : select_num;
    writeIndexFile("pos_index.txt", pos_select, pos_kept);
    writeIndexFile("neg_index.txt", neg_select, neg_kept);
}

/*
Copies the selected lines of input_path to ouput_path.

select_num   number of entries in each of pos_select and neg_select
pos_select   zero-based line indices of the selected positive samples
neg_select   zero-based line indices of the selected negative samples
input_path   sample file to read
ouput_path   file to create; it is overwritten if it exists

Lines are written in file order, each followed by '\n'. A line is written once
per slot that names it. Negative entries match no line and are ignored.
Prints a message and returns if either file cannot be opened.
*/
void saveFile(long select_num,long* pos_select,long* neg_select,char* input_path,char* ouput_path)
{
    std::ifstream fin(input_path);
    FILE *output_file = fopen(ouput_path, "w");
    if (output_file == NULL)
    {
        printf("can not open the output sample file!\n");
        return;
    }
    if (!fin)
    {
        printf("the input sample file does not exist!\n");
        fclose(output_file);
        return;
    }

    // Gather every requested index and sort them, so the file can be walked
    // once with a cursor instead of scanning all slots for every line.
    std::vector<long> wanted;
    if (select_num > 0)
        wanted.reserve(static_cast<std::vector<long>::size_type>(select_num) * 2);
    for (long slot = 0; slot < select_num; slot++)
    {
        if (pos_select[slot] >= 0)
            wanted.push_back(pos_select[slot]);
        if (neg_select[slot] >= 0)
            wanted.push_back(neg_select[slot]);
    }
    std::sort(wanted.begin(), wanted.end());

    std::vector<long>::size_type next = 0;
    std::string line;    //grows as needed, so long lines do not end the copy
    long index = 0;
    // Stop reading as soon as every requested line has been written.
    while (next < wanted.size() && std::getline(fin, line))
    {
        while (next < wanted.size() && wanted[next] == index)
        {
            fprintf(output_file, "%s\n", line.c_str());
            ++next;
        }
        index++;
    }
    fclose(output_file);
    return;
}

/* Writes the description of the three command-line arguments to stdout. */
void usage()
{
    static const char* const argument_help[] = {
        " arg1:input sample file path\n",
        " arg2:output sample file path\n",
        " arg3:sample number need to select for each postive and negative samples\n"
    };
    const int help_count = static_cast<int>(sizeof(argument_help) / sizeof(argument_help[0]));

    for (int line = 0; line < help_count; ++line)
        fputs(argument_help[line], stdout);
}

posted @ 2012-11-27 13:21  handspeaker  阅读(19134)  评论(0编辑  收藏  举报