Top-K:space saving algorithm

image

/*
 *
 *  Written by : Prashant Kumar Prajapati
 *  Time complexity : O(n)
 *  Space complexity : O(n)
 *
 */

 /*
  *
  *  This script implements a prototype of the space saving algorithm.
  *  Internally the algorithm uses a max-heap and a hash map for finding the number with the maximum count.
  *  The hash map has been implemented as a hash-table in this script.
  *  Input is taken from input.txt and output is written to output.txt.
  *  This script writes a number to output.txt file if it is present in more than 20% of the dataset
  *
  */

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

struct node
{
    int key;
    int val;
    struct node* next;
};

struct heap_node
{
    int val;
    int cnt;
};

int size=0;
struct node database [ 500001 ];
struct heap_node heap [ 500001 ];

/*********LINKED LIST*******/

struct node* newNode (int key, int val)
{
    struct node* temp = (struct node*) malloc ( sizeof (struct node) );
    temp->key = key;
    temp->val = val;
    temp->next = NULL;
    return temp;
}

///void map_insert (int key, int val)
///{
///    int hash_key = hash (key);
///    list_insert (hash_key, val);
///}

///list_insert(hash2,size);
///hash2定位map中桶的位置,size定位counter的位置

void list_insert (int key, int val)
{
    if (database[ key ].val == -1)
    {
        database[ key ].val = 0;
        struct node* temp = newNode (key, val);
        database[ key ].next = temp;
    }
    else
    {
        struct node* temp = database[ key ].next;

        while (temp->next != NULL)
            temp = temp->next;

        temp->next = newNode (key, val);
    }
}

struct node* list_search (int key)
{
    if (database[ key ].val == -1)
        return newNode(key,-100000000);

    return database[ key ].next;
}

/*********HASH TABLE********/

const int md = 50000;

unsigned int hash (unsigned int x)
{
    unsigned int a = 636586, b = 546234;

    x = ((x >> 16) ^ x) * 0x45d9f3b;
    x = ((x >> 16) ^ x) * 0x45d9f3b;
    x = (x >> 16) ^ x;
    x = (a + (x * b)) % md;
    if (x <= 0)
        x = x + md;

    return x;
}

void map_insert (int key, int val)
{
    int hash_key = hash (key);
    list_insert (hash_key, val);
}
///输入:key 输出:对应的counter
int map_search (int key)
{
    int hash_key = hash (key);
    struct node* temp = list_search (hash_key);

    if (temp->val == -100000000)
        return -1;


    if (heap [ temp->val ].val == key)
        return temp->val;

    while (temp->next != NULL)
    {
        if(heap[temp->val].val==key)
            return temp->val;

        temp = temp->next;
    }
    return -1;
}
///更新map中的key对应的counter
void map_data_update (int key, int val)
{
    int hash_key = hash(key);
    struct node* temp = list_search (hash_key);
    if (temp->val == -100000000)
        return;

    if( heap[temp->val].val ==key )
    {
        temp->val = val;
        return;
    }

    while (temp->next != NULL)
    {
        if(heap[temp->val].val == key)
        {
            temp->val = val;
            return;
        }
        temp = temp->next;
    }
}
///把key1删掉,把key2添加进map和heap里。key2分配的counter的下标是:size
void map_hash_update (int key1, int key2)
{
    unsigned int hash1 = hash(key1);
    unsigned int hash2 = hash(key2);

    struct node* temp = list_search (hash1);
    struct node* temp2 = newNode(0,0);
    temp2->next = temp;

    if ( heap[temp->val].val == key1 )
    {
        if ( temp->next == NULL )
            database[hash1] = *(newNode(0,-1));
        else
            database[hash1].next = temp->next;

        free(temp);
        list_insert(hash2,size);

        return;
    }
    while (temp->next != NULL)
    {
        if (heap[temp->val].val == key1)
        {
            temp2->next = temp->next;
            free(temp);
            break;
        }
        temp2 = temp;
        temp = temp->next;
    }
    list_insert(hash2,size);
    ///hash2定位map中桶的位置,size定位counter的位置
}

void map_delete (int key)
{
    int hash_key = hash(key);
    struct node* temp = list_search(hash_key);
    struct node* temp2 = newNode(0,0);
    temp2->next = temp;

    if ( heap[temp->val].val == key )
    {
        if ( temp->next == NULL )
            database[hash_key] = *(newNode(0,-1));
        else
            database[hash_key].next = temp->next;

        free(temp);
        return;
    }

    while (temp->next != NULL)
    {
        if (heap[temp->val].val == key)
        {
            temp2->next = temp->next;
            free(temp);
            break;
        }
        temp2 = temp;
        temp = temp->next;
    }
}

/********** MAX - HEAP *********/

void swap (int a, int b)
{
    struct heap_node temp = heap[a];
    heap[a] = heap[b];
    heap[b] = temp;
}
///堆的中间元素发生变化,进行调整
void heapify (int position)
{
    int l = 2 * position;
    int r = 2 * position + 1;
    int ans = position;
    if ( ( l <= size ) && ( heap[l].cnt > heap[ans].cnt ) )
        ans = l;

    if ( ( r <= size ) && ( heap[r].cnt > heap[ans].cnt ) )
        ans = r;

    if (ans != position)
    {
        map_data_update ( heap[position].val, ans);
        map_data_update ( heap[ans].val, position);
        swap (ans, position);
        heapify (ans);
    }
}
///堆的最后一个元素发生变化,调整堆
void move (int position)
{
    while ( ( heap[position].cnt > heap[position/2].cnt ) && ( position > 1 ) )
    {
        map_data_update ( heap[position/2].val, position);
        map_data_update ( heap[position].val, position/2);
        swap ( position, position/2);
        position = position / 2;
    }
}
///为新的key分配counter。同时把(hash(key),counter)插入到map中。最后调整堆
void heap_insert (int val)
{
    struct heap_node to_add;
    to_add.val = val;
    to_add.cnt = 1;
    heap[++size] = to_add;
    map_insert (val, size);
    move (size);
}
///输入:key 操作:把key对应的counter+1(先通过map找的key对应的counter)
void heap_update (int val)
{
    int pos = map_search (val);
    heap[pos].val = val;
    heap[pos].cnt++;
    move(pos);
}
///输入:key 操作:把指向最后一个counter(heap[size])的key(最小key)变成指向要删除的key对应的counter。这样heap[size]就空下了。
///map也删除key。
void heap_delete (int val)
{
    int position = map_search (val);
    map_data_update ( heap[size].val, position);
    map_delete (heap[position].val);
    swap (position, size);
    heap[size].val = 0;
    heap[size].cnt = 0;
    size--;
    heapify (position);///堆的中间元素发生变化,进行调整
}
///访问堆顶元素
struct heap_node max_element(){
    return heap[1];
}

/******* SPACE SAVING ALGORITHM *******/

void space_saving (int key)
{
///hash-map表中key对应的位置已经关联到堆中的一个计数器了。直接更新对应的计数器
    if ( map_search (key) != -1)
        heap_update (key);
///hash-map中还没有当前数据包对应的流记录。
    else
    {
        ///且计数器的值没有达到,则为当前数据包分配一个车计数器
        if (size < 49999)
            heap_insert(key);///map_insert (val, size);在heap_insert(key);函数内

        ///计数器已经被分配完了?用新数据包换出最小的流记录,计数器值+1,调整堆。
        else
        {
            map_hash_update (heap[size].val, key);
            heap[size].val = key;
            heap[size].cnt++;
            move(size);
        }
    }
}

/********** MAIN FUNCTION ***********/

int main()
{
    int i;
    for ( i = 0; i <= 50000; i++)
        database[i] = *(newNode(0,-1));

    FILE *f1,*f2;
    f1 = fopen( "input.txt", "r");
    f2 = fopen( "output.txt", "w");

    if(f1)
    {
        int n, val, cnt = 0;
        fscanf ( f1, "%d", &n);
        for( i = 0; i < n; i++)
        {
            fscanf ( f1, "%d", &val);
            space_saving (val);
            if (max_element().cnt > n/5)
            {
                fprintf ( f2, "The number to delete is %d\n", max_element().val);
                heap_delete ( max_element().val);
                cnt++;
            }
        }
        fprintf ( f2, "No. of deleted numbers = %d\n", cnt);
    }
    return 0;
}

posted @ 2021-10-17 15:27  没有任何出路  阅读(536)  评论(0)    收藏  举报