<?php
/**
 * Bloom filter stored either in an in-process byte string or in a Redis bitmap.
 *
 * has() may report a value that was never added (false positive); the rate
 * depends on the bit count, the number of hash positions and the number of
 * inserted values (see falsePositiveProbability()).
 */
class BloomFilter
{
    /** @var int Number of bits in the filter */
    protected $size;
    /** @var int Number of bit positions derived for each value */
    protected $hashCount;
    /** @var string Bit array packed 8 bits per byte (in-memory mode only) */
    protected $bitArray;
    /** @var array Integer offsets mixed into the derived positions */
    protected $seeds;
    /** @var bool Whether bits are stored in Redis instead of $bitArray */
    protected $useRedis = false;
    /** @var Redis|null Redis connection (Redis mode only) */
    protected $redis = null;
    /** @var string Redis key that holds the bitmap */
    protected $redisKey = 'bloom:filter';

    /**
     * Create a filter and allocate its storage.
     *
     * @param int        $size      Number of bits (suggested: 10-15x the expected item count)
     * @param int        $hashCount Number of positions per value (suggested: 5-10)
     * @param array|null $seeds     Integer seeds; a built-in list is used when empty or not an array
     * @param bool       $useRedis  Store bits in Redis instead of process memory
     * @throws InvalidArgumentException When $size or $hashCount is less than 1
     * @throws Exception                When Redis mode is requested but unavailable
     */
    public function __construct($size = 1000000, $hashCount = 8, $seeds = null, $useRedis = false)
    {
        $size = (int)$size;
        $hashCount = (int)$hashCount;
        // A zero size would make getHashes() divide by zero; a zero hash
        // count would make has() report every value as present.
        if ($size < 1) {
            throw new InvalidArgumentException('Bloom filter size must be a positive integer');
        }
        if ($hashCount < 1) {
            throw new InvalidArgumentException('Bloom filter hash count must be a positive integer');
        }

        $this->size = $size;
        $this->hashCount = $hashCount;
        // Re-index so getHashes() can address seeds by position 0..n-1.
        $this->seeds = (is_array($seeds) && $seeds)
            ? array_values($seeds)
            : [31, 37, 41, 43, 47, 53, 59, 61, 67, 71];
        $this->useRedis = $useRedis;

        if ($useRedis) {
            $this->initRedis();
        } else {
            $this->bitArray = str_repeat("\0", $this->byteLength());
        }
    }

    /**
     * Number of bytes needed to hold $size bits.
     *
     * @return int
     */
    private function byteLength()
    {
        return (int)ceil($this->size / 8);
    }

    /**
     * Whether a value is treated as "nothing to store".
     *
     * empty() is deliberately not used: it is also true for "0" and 0, which
     * are legitimate members.
     *
     * @param mixed $value
     * @return bool
     */
    private function isBlank($value)
    {
        return $value === null || $value === false || $value === '' || $value === [];
    }

    /**
     * Open the Redis connection used for bit storage.
     *
     * @throws Exception When the Redis extension is missing or the connection fails
     */
    protected function initRedis()
    {
        if (!class_exists('Redis')) {
            throw new Exception('Redis extension not installed');
        }

        $redis = new Redis();
        if ($redis->connect('127.0.0.1', 6379) === false) {
            throw new Exception('Unable to connect to Redis at 127.0.0.1:6379');
        }
        // Bits are read and written with Redis SETBIT/GETBIT; no local array is needed.
        $this->redis = $redis;
    }

    /**
     * Derive the bit positions for a value.
     *
     * Combines two base hashes (CRC32 and fnv1aHash()) as h1 + i*h2 + seed
     * and reduces the result modulo the filter size.
     *
     * @param string $value Value to hash
     * @return array List of $hashCount positions, each in 0..$size-1
     */
    protected function getHashes($value)
    {
        $value = (string)$value;
        $hash1 = crc32($value);
        $hash2 = fnv1aHash($value);
        $seedCount = count($this->seeds);

        $hashes = [];
        for ($i = 0; $i < $this->hashCount; $i++) {
            $seed = $this->seeds[$i % $seedCount];
            $hashes[] = abs($hash1 + $i * $hash2 + $seed) % $this->size;
        }
        return $hashes;
    }

    /**
     * Add a value to the filter. Blank values (null, false, '', []) are ignored.
     *
     * @param string $value
     */
    public function add($value)
    {
        if ($this->isBlank($value)) {
            return;
        }

        $hashes = $this->getHashes($value);
        if ($this->useRedis && $this->redis) {
            foreach ($hashes as $position) {
                $this->redis->setBit($this->redisKey, $position, 1);
            }
        } else {
            foreach ($hashes as $position) {
                $this->setBit($position);
            }
        }
    }

    /**
     * Set one bit of the in-memory array; positions past the array are ignored.
     *
     * @param int $position Bit index
     */
    protected function setBit($position)
    {
        $bytePos = $position >> 3;
        $bitPos = $position & 7;
        if ($bytePos < strlen($this->bitArray)) {
            $byte = ord($this->bitArray[$bytePos]) | (1 << $bitPos);
            $this->bitArray[$bytePos] = chr($byte);
        }
    }

    /**
     * Check whether a value may have been added (false positives are possible).
     *
     * @param string $value
     * @return bool True when every derived bit is set; false when any is
     *              clear or the value is blank
     */
    public function has($value)
    {
        if ($this->isBlank($value)) {
            return false;
        }

        $hashes = $this->getHashes($value);
        if ($this->useRedis && $this->redis) {
            foreach ($hashes as $position) {
                if (!$this->redis->getBit($this->redisKey, $position)) {
                    return false;
                }
            }
        } else {
            foreach ($hashes as $position) {
                if (!$this->getBit($position)) {
                    return false;
                }
            }
        }
        return true;
    }

    /**
     * Read one bit of the in-memory array; positions past the array read as unset.
     *
     * @param int $position Bit index
     * @return bool
     */
    protected function getBit($position)
    {
        $bytePos = $position >> 3;
        $bitPos = $position & 7;
        if ($bytePos >= strlen($this->bitArray)) {
            return false;
        }
        return (ord($this->bitArray[$bytePos]) & (1 << $bitPos)) !== 0;
    }

    /**
     * Theoretical false-positive probability after inserting $itemCount values.
     *
     * @param int $itemCount Number of inserted values
     * @return float Probability in 0..1 (0.0 when $itemCount is not positive)
     */
    public function falsePositiveProbability($itemCount)
    {
        if ($itemCount <= 0) {
            return 0.0;
        }
        // p = (1 - e^(-k*n/m))^k
        $k = $this->hashCount;
        $m = $this->size;
        $n = $itemCount;
        return pow(1 - exp(-$k * $n / $m), $k);
    }

    /**
     * Estimate the bit count and hash count for a target false-positive rate.
     *
     * @param int   $expectedItems     Expected number of values (must be > 0)
     * @param float $falsePositiveRate Target rate, strictly between 0 and 1
     * @return array ['size' => int, 'hashCount' => int]
     * @throws InvalidArgumentException When an argument is out of range
     */
    public static function estimateParameters($expectedItems, $falsePositiveRate = 0.01)
    {
        if ($expectedItems <= 0) {
            throw new InvalidArgumentException('expectedItems must be greater than 0');
        }
        if ($falsePositiveRate <= 0 || $falsePositiveRate >= 1) {
            throw new InvalidArgumentException('falsePositiveRate must be between 0 and 1 (exclusive)');
        }

        // m = -n * ln(p) / (ln 2)^2
        $size = ceil(-($expectedItems * log($falsePositiveRate)) / pow(log(2), 2));
        // k = (m / n) * ln 2
        $hashCount = ceil(($size / $expectedItems) * log(2));
        return [
            'size' => (int)$size,
            'hashCount' => (int)$hashCount
        ];
    }

    /**
     * Reset the filter: delete the Redis key, or zero the in-memory array.
     */
    public function clear()
    {
        if ($this->useRedis && $this->redis) {
            $this->redis->del($this->redisKey);
        } else {
            $this->bitArray = str_repeat("\0", $this->byteLength());
        }
    }

    /**
     * Write the in-memory bit array to a file (no-op in Redis mode).
     *
     * @param string $filename
     * @return bool True when the file was written; false in Redis mode or on write failure
     */
    public function saveToFile($filename)
    {
        if ($this->useRedis) {
            return false;
        }
        return file_put_contents($filename, $this->bitArray) !== false;
    }

    /**
     * Replace the in-memory bit array with the contents of a file.
     *
     * The filter is left unchanged when the file is missing, unreadable, or
     * not exactly as long as this filter's bit array: setBit() ignores bits
     * beyond the array, so a shorter array would lose insertions.
     *
     * @param string $filename
     * @return bool True when the contents were loaded
     */
    public function loadFromFile($filename)
    {
        if ($this->useRedis || !file_exists($filename)) {
            return false;
        }
        $contents = file_get_contents($filename);
        if ($contents === false || strlen($contents) !== $this->byteLength()) {
            return false;
        }
        $this->bitArray = $contents;
        return true;
    }

    /**
     * Report the size of the in-memory bit array.
     *
     * @return array|string ['bytes', 'kilobytes', 'megabytes'], or the string
     *                      'Stored in Redis' in Redis mode
     */
    public function getMemoryUsage()
    {
        if ($this->useRedis) {
            return 'Stored in Redis';
        }
        $bytes = strlen($this->bitArray);
        return [
            'bytes' => $bytes,
            'kilobytes' => round($bytes / 1024, 2),
            'megabytes' => round($bytes / 1024 / 1024, 2)
        ];
    }

    /**
     * Close the Redis connection, if one was opened.
     */
    public function __destruct()
    {
        if ($this->redis) {
            $this->redis->close();
        }
    }
}
/**
 * 32-bit FNV-1a hash of a byte string, reduced to a non-negative integer.
 *
 * @param string $string Bytes to hash
 * @return int The FNV-1a 32-bit digest with its top bit cleared (0..0x7FFFFFFF)
 */
function fnv1aHash($string)
{
    // Delegating to hash() keeps the running state within 32 bits after every
    // byte; accumulating it in a native PHP integer can overflow into a float
    // on longer inputs and corrupt the result.
    $digest = unpack('N', hash('fnv1a32', (string)$string, true));

    // Clear the top bit so the value is never negative.
    return $digest[1] & 0x7FFFFFFF;
}
/**
 * Demo/benchmark for the in-memory BloomFilter.
 *
 * Sizes a filter for 100,000 items at a 1% target rate, inserts 150,000 keys
 * (three per loop iteration), then prints the insert timing, a membership
 * spot-check, the measured and theoretical false-positive rates, and the
 * memory usage.
 */
function testBloomFilter()
{
    echo "=== 布隆过滤器性能测试 ===\n\n";

    // Estimate the filter parameters.
    $params = BloomFilter::estimateParameters(100000, 0.01);
    echo "估算参数:\n";
    echo "预期元素: 100,000\n";
    echo "期望假阳性率: 1%\n";
    echo "建议位数组大小: " . number_format($params['size']) . "\n";
    echo "建议哈希函数数量: {$params['hashCount']}\n\n";

    // Build the filter.
    $filter = new BloomFilter($params['size'], $params['hashCount']);
    echo "内存使用(初始): \n";
    print_r($filter->getMemoryUsage());
    echo "\n";

    // Insert the test data.
    $start = microtime(true);
    $testCount = 50000;
    echo "添加 {$testCount} 个元素...\n";
    for ($i = 0; $i < $testCount; $i++) {
        $filter->add("user_" . $i);
        $filter->add("email_" . $i . "@example.com");
        $filter->add("session_" . md5($i));
        // Report progress every 10000 iterations.
        if ($i % 10000 === 0 && $i > 0) {
            echo "已添加 {$i} 个...\n";
        }
    }
    $time = microtime(true) - $start;
    echo "添加完成,耗时: " . round($time, 3) . "秒\n";
    echo "平均每秒: " . round($testCount * 3 / $time) . "次操作\n\n";

    // Spot-check keys that were inserted above.
    echo "测试存在性检查...\n";
    $found = 0;
    for ($i = 0; $i < 1000; $i++) {
        if ($filter->has("user_" . $i)) {
            $found++;
        }
    }
    echo "已存在元素检查:找到 {$found}/1000 个\n\n";

    // Measure false positives with keys that were never inserted.
    echo "测试假阳性率(检查不存在的数据)...\n";
    $falsePositives = 0;
    $testFalseCount = 10000;
    for ($i = 100000; $i < 100000 + $testFalseCount; $i++) {
        if ($filter->has("user_" . $i)) {
            $falsePositives++;
        }
    }
    $falsePositiveRate = ($falsePositives / $testFalseCount) * 100;
    echo "假阳性数: {$falsePositives}/{$testFalseCount}\n";
    echo "假阳性率: " . round($falsePositiveRate, 2) . "%\n";

    // Theoretical rate for the number of keys actually inserted.
    $theoreticalRate = $filter->falsePositiveProbability($testCount * 3) * 100;
    echo "理论假阳性率: " . round($theoreticalRate, 2) . "%\n\n";

    // Memory usage after the run.
    echo "最终内存使用: \n";
    print_r($filter->getMemoryUsage());
}
/**
 * Demo for the Redis-backed BloomFilter.
 *
 * Prints a notice and returns when the Redis class is not available.
 * Otherwise inserts 1000 keys, prints the elapsed time, looks up the first
 * 100 of them, prints how many were found, and clears the filter.
 */
function testRedisBloomFilter()
{
    // Nothing to do without the Redis extension.
    if (class_exists('Redis') === false) {
        echo "Redis扩展未安装,跳过Redis测试\n";
        return;
    }

    echo "\n=== Redis布隆过滤器测试 ===\n\n";

    $sizing = BloomFilter::estimateParameters(1000000, 0.01);
    $bloom = new BloomFilter($sizing['size'], $sizing['hashCount'], null, true);

    // Timed insert phase.
    echo "测试添加1000个元素到Redis...\n";
    $startedAt = microtime(true);
    foreach (range(0, 999) as $n) {
        $bloom->add("redis_item_" . $n);
    }
    $elapsed = microtime(true) - $startedAt;
    echo "完成,耗时: " . round($elapsed, 3) . "秒\n";

    // Lookup phase: count how many of the first 100 keys are reported present.
    $found = 0;
    foreach (range(0, 99) as $n) {
        $found += $bloom->has("redis_item_" . $n) ? 1 : 0;
    }
    echo "查询测试:找到 {$found}/100 个元素\n";

    // Remove the bitmap written by this run.
    $bloom->clear();
}
// Entry point. Act only when this file is the script that was requested, so
// that including it as a library from other code produces no output.
if (isset($_SERVER['SCRIPT_FILENAME']) && realpath($_SERVER['SCRIPT_FILENAME']) === __FILE__) {
    if (php_sapi_name() === 'cli') {
        // Run directly from the command line: execute the demos.
        testBloomFilter();
        testRedisBloomFilter();
    } else {
        // Requested directly over the web: print a short usage note.
        echo "<pre>";
        echo "布隆过滤器PHP实现\n";
        echo "使用方法:\n";
        echo "\$filter = new BloomFilter(1000000, 8);\n";
        echo "\$filter->add('value');\n";
        echo "\$exists = \$filter->has('value');\n";
        echo "</pre>";
    }
}