PHP实现内存、Redis布隆过滤器

<?php

/**
 * Bloom filter backed either by an in-memory bit string or by a Redis bitmap.
 *
 * has() may report false positives but, for any value previously passed to
 * add() on the same filter, never a false negative. Bit positions are derived
 * by double hashing from crc32() and the file-level fnv1aHash() helper.
 */
class BloomFilter
{
    /** @var int Number of bits in the filter (m). */
    protected $size;

    /** @var int Number of bit positions set/tested per element (k). */
    protected $hashCount;

    /** @var string Bit array stored as a binary string, 8 bits per byte (memory mode only). */
    protected $bitArray;

    /** @var array Numerically indexed list of seeds mixed into each derived position. */
    protected $seeds;

    /** @var bool Whether the bits live in Redis instead of $bitArray. */
    protected $useRedis = false;

    /** @var Redis|null Redis connection (Redis mode only). */
    protected $redis = null;

    /** @var string Redis key holding the bitmap. */
    protected $redisKey = 'bloom:filter';

    /**
     * Create a filter.
     *
     * @param int        $size      Number of bits (suggested: 10-15x the expected element count).
     * @param int        $hashCount Number of hash positions per element (suggested: 5-10).
     * @param array|null $seeds     Hash seeds; a built-in list is used when empty.
     * @param bool       $useRedis  Store the bits in Redis instead of process memory.
     *
     * @throws InvalidArgumentException When $size or $hashCount is not a positive integer.
     * @throws Exception                When Redis mode is requested but unavailable.
     */
    public function __construct($size = 1000000, $hashCount = 8, $seeds = null, $useRedis = false)
    {
        $size = (int)$size;
        $hashCount = (int)$hashCount;

        // A zero size would cause a modulo-by-zero in getHashes(); a zero hash
        // count would make has() report every value as present.
        if ($size <= 0) {
            throw new InvalidArgumentException('Bloom filter size must be a positive integer');
        }
        if ($hashCount <= 0) {
            throw new InvalidArgumentException('Bloom filter hash count must be a positive integer');
        }

        $this->size = $size;
        $this->hashCount = $hashCount;

        // Re-index caller-supplied seeds so getHashes() can address them by
        // numeric offset even if the array was associative or sparse.
        $this->seeds = $seeds
            ? array_values((array)$seeds)
            : [31, 37, 41, 43, 47, 53, 59, 61, 67, 71];

        $this->useRedis = (bool)$useRedis;

        if ($this->useRedis) {
            $this->initRedis();
        } else {
            $this->bitArray = str_repeat("\0", $this->byteLength());
        }
    }

    /**
     * Open the Redis connection used in Redis mode.
     *
     * @throws Exception When the Redis extension is missing or the server cannot be reached.
     */
    protected function initRedis()
    {
        if (!class_exists('Redis')) {
            throw new Exception('Redis extension not installed');
        }

        $client = new Redis();

        // Depending on the extension version a failed connect() either returns
        // false or throws; surface the "returns false" case instead of
        // continuing with an unusable client.
        if (!$client->connect('127.0.0.1', 6379)) {
            throw new RuntimeException('Unable to connect to Redis at 127.0.0.1:6379');
        }

        $this->redis = $client;
    }

    /**
     * Derive the bit positions for a value.
     *
     * Position i is abs(h1 + i * h2 + seed[i mod seedCount]) mod size, where
     * h1 is crc32() and h2 is fnv1aHash().
     *
     * @param string $value Value to hash (cast to string).
     * @return array List of $hashCount positions, each in [0, size).
     */
    protected function getHashes($value)
    {
        $value = (string)$value;

        $hash1 = crc32($value);
        $hash2 = fnv1aHash($value);
        $seedCount = count($this->seeds);

        $positions = [];
        for ($i = 0; $i < $this->hashCount; $i++) {
            $seed = $this->seeds[$i % $seedCount];
            $positions[] = abs($hash1 + $i * $hash2 + $seed) % $this->size;
        }

        return $positions;
    }

    /**
     * Add a value to the filter.
     *
     * Blank values (null, false, '' and []) are ignored. Zero-like values such
     * as 0 and "0" are real elements and are stored.
     *
     * @param string $value
     */
    public function add($value)
    {
        if ($this->isBlank($value)) {
            return;
        }

        $positions = $this->getHashes($value);

        if ($this->useRedis && $this->redis) {
            foreach ($positions as $position) {
                $this->redis->setBit($this->redisKey, $position, 1);
            }
        } else {
            foreach ($positions as $position) {
                $this->setBit($position);
            }
        }
    }

    /**
     * Set one bit in the in-memory bit array; positions past the end are ignored.
     *
     * @param int $position Zero-based bit index.
     */
    protected function setBit($position)
    {
        $bytePos = $position >> 3;
        $bitPos = $position & 7;

        if ($bytePos < strlen($this->bitArray)) {
            $this->bitArray[$bytePos] = chr(ord($this->bitArray[$bytePos]) | (1 << $bitPos));
        }
    }

    /**
     * Test whether a value may have been added (false positives are possible).
     *
     * Blank values (null, false, '' and []) are never stored, so they always
     * yield false.
     *
     * @param string $value
     * @return bool False means "definitely not added"; true means "possibly added".
     */
    public function has($value)
    {
        if ($this->isBlank($value)) {
            return false;
        }

        $positions = $this->getHashes($value);

        if ($this->useRedis && $this->redis) {
            foreach ($positions as $position) {
                if (!$this->redis->getBit($this->redisKey, $position)) {
                    return false;
                }
            }
        } else {
            foreach ($positions as $position) {
                if (!$this->getBit($position)) {
                    return false;
                }
            }
        }

        return true;
    }

    /**
     * Read one bit from the in-memory bit array.
     *
     * @param int $position Zero-based bit index.
     * @return bool True when the bit is set; false when clear or out of range.
     */
    protected function getBit($position)
    {
        $bytePos = $position >> 3;
        $bitPos = $position & 7;

        if ($bytePos >= strlen($this->bitArray)) {
            return false;
        }

        return (ord($this->bitArray[$bytePos]) & (1 << $bitPos)) !== 0;
    }

    /**
     * Theoretical false-positive probability after inserting $itemCount elements.
     *
     * Uses p = (1 - e^(-k*n/m))^k with k = hash count, m = bit count, n = items.
     *
     * @param int $itemCount Number of inserted elements.
     * @return float Probability in [0, 1]; 0.0 when $itemCount is not positive.
     */
    public function falsePositiveProbability($itemCount)
    {
        if ($itemCount <= 0) {
            return 0.0;
        }

        $k = $this->hashCount;
        $m = $this->size;
        $n = $itemCount;

        return pow(1 - exp(-$k * $n / $m), $k);
    }

    /**
     * Estimate the optimal bit count and hash count for a target error rate.
     *
     * size = ceil(-n * ln(p) / ln(2)^2), hashCount = ceil(size / n * ln(2)).
     *
     * @param int   $expectedItems     Expected number of elements (must be > 0).
     * @param float $falsePositiveRate Desired false-positive rate, strictly between 0 and 1.
     * @return array ['size' => int, 'hashCount' => int]
     *
     * @throws InvalidArgumentException When an argument is outside its valid range.
     */
    public static function estimateParameters($expectedItems, $falsePositiveRate = 0.01)
    {
        if ($expectedItems <= 0) {
            throw new InvalidArgumentException('Expected item count must be greater than zero');
        }
        // log(0) is -INF and rates >= 1 give a non-positive size, so reject both ends.
        if ($falsePositiveRate <= 0 || $falsePositiveRate >= 1) {
            throw new InvalidArgumentException('False positive rate must be between 0 and 1 (exclusive)');
        }

        $size = ceil(-($expectedItems * log($falsePositiveRate)) / pow(log(2), 2));
        $hashCount = ceil(($size / $expectedItems) * log(2));

        return [
            'size' => (int)$size,
            'hashCount' => max(1, (int)$hashCount)
        ];
    }

    /**
     * Reset the filter: delete the Redis key, or zero the in-memory bit array.
     */
    public function clear()
    {
        if ($this->useRedis && $this->redis) {
            $this->redis->del($this->redisKey);
        } else {
            $this->bitArray = str_repeat("\0", $this->byteLength());
        }
    }

    /**
     * Persist the in-memory bit array to a file (no-op in Redis mode).
     *
     * @param string $filename Destination path.
     * @return bool True when the data was written; false in Redis mode or on write failure.
     */
    public function saveToFile($filename)
    {
        if ($this->useRedis) {
            return false;
        }

        return file_put_contents($filename, $this->bitArray) !== false;
    }

    /**
     * Load the in-memory bit array from a file written by saveToFile().
     *
     * The file is accepted only when its length matches this filter's size.
     * Loading a buffer of a different length would make lookups miss bits that
     * were set, i.e. produce false negatives. On any failure the current
     * contents are left untouched.
     *
     * @param string $filename Source path.
     * @return bool True when the bit array was replaced; false otherwise.
     */
    public function loadFromFile($filename)
    {
        if ($this->useRedis || !is_file($filename) || !is_readable($filename)) {
            return false;
        }

        $data = file_get_contents($filename);
        if ($data === false || strlen($data) !== $this->byteLength()) {
            return false;
        }

        $this->bitArray = $data;
        return true;
    }

    /**
     * Report the size of the in-memory bit array.
     *
     * @return array|string ['bytes', 'kilobytes', 'megabytes'] in memory mode,
     *                      or the string 'Stored in Redis' in Redis mode.
     */
    public function getMemoryUsage()
    {
        if ($this->useRedis) {
            return 'Stored in Redis';
        }

        $bytes = strlen($this->bitArray);
        return [
            'bytes' => $bytes,
            'kilobytes' => round($bytes / 1024, 2),
            'megabytes' => round($bytes / 1024 / 1024, 2)
        ];
    }

    /**
     * Close the Redis connection, if one was opened.
     */
    public function __destruct()
    {
        if ($this->redis) {
            $this->redis->close();
        }
    }

    /**
     * Number of bytes needed to hold $size bits.
     *
     * @return int
     */
    private function byteLength()
    {
        return (int)ceil($this->size / 8);
    }

    /**
     * Whether a value counts as "no element".
     *
     * Unlike empty(), this does not treat 0, 0.0 or "0" as blank, so those
     * values can be stored and looked up.
     *
     * @param mixed $value
     * @return bool
     */
    private function isBlank($value)
    {
        return $value === null || $value === false || $value === '' || $value === [];
    }
}

/**
 * FNV-1a (32-bit) hash of a string, reduced to a non-negative 31-bit integer.
 *
 * The digest comes from PHP's built-in "fnv1a32" algorithm, which truncates
 * every multiplication step to 32 bits. A hand-rolled shift/add loop on PHP's
 * 64-bit integers does not: its sums overflow into floats after a couple of
 * bytes, which loses low bits and raises float-to-int deprecations on
 * PHP 8.1+.
 *
 * NOTE: the bit positions a Bloom filter derives from this value depend on
 * the exact hash, so persisted filters must be built and queried with the
 * same implementation.
 *
 * @param string $string Input (cast to string).
 * @return int Hash value in [0, 0x7FFFFFFF].
 */
function fnv1aHash($string)
{
    // Raw digest is 4 bytes, big-endian; 'N' decodes it as an unsigned 32-bit integer.
    $parts = unpack('N', hash('fnv1a32', (string)$string, true));

    // Clear the top bit so the result is non-negative on every platform.
    return $parts[1] & 0x7FFFFFFF;
}

/**
 * Demo / smoke test for the in-memory filter.
 *
 * Sizes a filter for the number of values actually inserted (three per loop
 * iteration), fills it, then prints insertion throughput, a lookup check on
 * known members, and the measured versus theoretical false-positive rate.
 * All results are echoed; nothing is returned.
 */
function testBloomFilter()
{
    echo "=== 布隆过滤器性能测试 ===\n\n";

    $testCount = 50000;
    $valuesPerIteration = 3;
    // Size the filter for what is really inserted; sizing it for fewer items
    // would push the measured rate well above the 1% target printed below.
    $expectedItems = $testCount * $valuesPerIteration;

    // Estimate optimal parameters for a 1% false-positive target.
    $params = BloomFilter::estimateParameters($expectedItems, 0.01);
    echo "估算参数:\n";
    echo "预期元素: " . number_format($expectedItems) . "\n";
    echo "期望假阳性率: 1%\n";
    echo "建议位数组大小: " . number_format($params['size']) . "\n";
    echo "建议哈希函数数量: {$params['hashCount']}\n\n";

    $filter = new BloomFilter($params['size'], $params['hashCount']);

    echo "内存使用(初始): \n";
    print_r($filter->getMemoryUsage());
    echo "\n";

    // Insert the test data and time it.
    $start = microtime(true);

    echo "添加 {$testCount} 个元素...\n";
    for ($i = 0; $i < $testCount; $i++) {
        $filter->add("user_" . $i);
        $filter->add("email_" . $i . "@example.com");
        $filter->add("session_" . md5((string)$i));

        // Report progress every 10000 iterations.
        if ($i % 10000 === 0 && $i > 0) {
            echo "已添加 {$i} 个...\n";
        }
    }

    $time = microtime(true) - $start;
    // Guard against a zero elapsed time on very coarse clocks.
    $opsPerSecond = $time > 0 ? round($expectedItems / $time) : 0;
    echo "添加完成,耗时: " . round($time, 3) . "秒\n";
    echo "平均每秒: " . $opsPerSecond . "次操作\n\n";

    // Every inserted value must be reported as present.
    echo "测试存在性检查...\n";
    $found = 0;

    for ($i = 0; $i < 1000; $i++) {
        if ($filter->has("user_" . $i)) {
            $found++;
        }
    }

    echo "已存在元素检查:找到 {$found}/1000 个\n\n";

    // Probe values that were never inserted to measure false positives.
    echo "测试假阳性率(检查不存在的数据)...\n";
    $falsePositives = 0;
    $testFalseCount = 10000;

    for ($i = 100000; $i < 100000 + $testFalseCount; $i++) {
        if ($filter->has("user_" . $i)) {
            $falsePositives++;
        }
    }

    $falsePositiveRate = ($falsePositives / $testFalseCount) * 100;
    echo "假阳性数: {$falsePositives}/{$testFalseCount}\n";
    echo "假阳性率: " . round($falsePositiveRate, 2) . "%\n";

    // Theoretical rate for the number of values actually inserted.
    $theoreticalRate = $filter->falsePositiveProbability($expectedItems) * 100;
    echo "理论假阳性率: " . round($theoreticalRate, 2) . "%\n\n";

    echo "最终内存使用: \n";
    print_r($filter->getMemoryUsage());
}

/**
 * Demo / smoke test for the Redis-backed filter.
 *
 * Skips with a message when the Redis extension is not loaded or when the
 * Redis server cannot be used, so the demo does not abort with an uncaught
 * exception on machines without Redis. On success it inserts 1000 values,
 * looks up the first 100, prints the results and clears the Redis key.
 */
function testRedisBloomFilter()
{
    if (!class_exists('Redis')) {
        echo "Redis扩展未安装,跳过Redis测试\n";
        return;
    }

    echo "\n=== Redis布隆过滤器测试 ===\n\n";

    try {
        $params = BloomFilter::estimateParameters(1000000, 0.01);
        $filter = new BloomFilter($params['size'], $params['hashCount'], null, true);

        echo "测试添加1000个元素到Redis...\n";
        $start = microtime(true);

        for ($i = 0; $i < 1000; $i++) {
            $filter->add("redis_item_" . $i);
        }

        $time = microtime(true) - $start;
        echo "完成,耗时: " . round($time, 3) . "秒\n";

        // Look up a sample of the values that were just inserted.
        $found = 0;
        for ($i = 0; $i < 100; $i++) {
            if ($filter->has("redis_item_" . $i)) {
                $found++;
            }
        }
        echo "查询测试:找到 {$found}/100 个元素\n";

        // Remove the test bitmap so repeated runs start from an empty filter.
        $filter->clear();
    } catch (Exception $e) {
        // Connection failures surface here, either from the constructor or
        // from the first Redis command.
        echo "Redis不可用,跳过Redis测试: " . $e->getMessage() . "\n";
    }
}

// Produce output only when this file is the script being executed, so that
// include/require of the file as a library stays silent. Paths are compared
// after realpath() so a different script with the same basename does not match.
$bloomEntryScript = isset($_SERVER['SCRIPT_FILENAME']) ? realpath($_SERVER['SCRIPT_FILENAME']) : false;

if ($bloomEntryScript !== false && $bloomEntryScript === realpath(__FILE__)) {
    if (PHP_SAPI === 'cli') {
        // Command line: run the demos.
        testBloomFilter();
        testRedisBloomFilter();
    } else {
        // Direct web request: show a short usage summary.
        echo "<pre>";
        echo "布隆过滤器PHP实现\n";
        echo "使用方法:\n";
        echo "\$filter = new BloomFilter(1000000, 8);\n";
        echo "\$filter->add('value');\n";
        echo "\$exists = \$filter->has('value');\n";
        echo "</pre>";
    }
}

unset($bloomEntryScript);
// Source: blog post by 朝阳1, posted 2026-01-10 14:50