Counting unique IDs within a rolling time range means counting the number of distinct identifiers seen inside a moving time window. This is a common data-analysis requirement, especially when working with real-time data streams or time-series data.
from collections import defaultdict
import time

class RollingUniqueCounter:
    """In-memory counter of distinct IDs seen within the last window_seconds."""

    def __init__(self, window_seconds):
        self.window_seconds = window_seconds
        self.id_timestamps = defaultdict(list)

    def add_id(self, id, timestamp=None):
        timestamp = timestamp if timestamp is not None else time.time()
        self.id_timestamps[id].append(timestamp)
        self._cleanup(timestamp)

    def count_unique(self, current_time=None):
        current_time = current_time if current_time is not None else time.time()
        self._cleanup(current_time)
        return len(self.id_timestamps)

    def _cleanup(self, current_time):
        cutoff = current_time - self.window_seconds
        to_delete = []
        for id, timestamps in self.id_timestamps.items():
            # Keep only the records that fall inside the time window
            new_timestamps = [t for t in timestamps if t >= cutoff]
            if new_timestamps:
                self.id_timestamps[id] = new_timestamps
            else:
                to_delete.append(id)
        for id in to_delete:
            del self.id_timestamps[id]
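A minimal usage sketch of this in-memory counter; the IDs and the 60-second window below are illustrative:

# Quick sanity check of the in-memory counter (values are illustrative)
counter = RollingUniqueCounter(window_seconds=60)
counter.add_id("user_1")
counter.add_id("user_2")
counter.add_id("user_1")          # duplicate within the window, counted once
print(counter.count_unique())     # -> 2

# Records older than the window no longer contribute to the count
counter.add_id("user_3", timestamp=time.time() - 120)
print(counter.count_unique())     # still 2: user_3's record is already expired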
import redis
import time

class RedisRollingUniqueCounter:
    """Redis-backed counter: one sorted set per ID, scored by timestamp."""

    def __init__(self, redis_client, key_prefix, window_seconds):
        self.redis = redis_client
        self.key_prefix = key_prefix
        self.window_seconds = window_seconds

    def add_id(self, id):
        current_time = time.time()
        key = f"{self.key_prefix}:{id}"
        self.redis.zadd(key, {str(current_time): current_time})
        # Trim entries that have fallen out of the window, then refresh the TTL
        self.redis.zremrangebyscore(key, 0, current_time - self.window_seconds)
        self.redis.expire(key, self.window_seconds)

    def count_unique(self):
        current_time = time.time()
        cutoff = current_time - self.window_seconds
        # Iterate over all keys with SCAN to avoid blocking the server
        unique_count = 0
        cursor = 0
        while True:
            cursor, keys = self.redis.scan(cursor, match=f"{self.key_prefix}:*")
            for key in keys:
                # Check whether this ID has at least one record inside the window
                if self.redis.zcount(key, cutoff, current_time) > 0:
                    unique_count += 1
            if cursor == 0:
                break
        return unique_count
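A usage sketch for the Redis-backed counter, assuming a Redis server reachable on localhost:6379; the connection settings and key prefix are placeholders:

# Placeholder connection settings for a local Redis server
client = redis.Redis(host="localhost", port=6379, db=0)
counter = RedisRollingUniqueCounter(client, key_prefix="active_users", window_seconds=300)

counter.add_id("user_1")
counter.add_id("user_2")
counter.add_id("user_1")          # same ID reuses the same sorted set
print(counter.count_unique())     # -> 2

Note that count_unique SCANs every key under the prefix, so its cost grows with the number of distinct IDs; for very high cardinalities an approximate structure such as HyperLogLog (below) is usually cheaper.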
# Example assuming InfluxDB (1.x) is used
import time
from influxdb import InfluxDBClient

class TSDBRollingUniqueCounter:
    """Time-series-database-backed counter using InfluxDB."""

    def __init__(self, db_client, measurement, window_seconds):
        self.db = db_client
        self.measurement = measurement
        self.window_seconds = window_seconds

    def add_id(self, id, timestamp=None):
        timestamp = timestamp if timestamp is not None else int(time.time() * 1e9)  # nanosecond timestamp
        data = [{
            "measurement": self.measurement,
            "tags": {"id": id},
            "time": timestamp,
            # InfluxQL's DISTINCT() only works on fields, not tags,
            # so also store the ID as a field value for counting
            "fields": {"id_value": id, "value": 1}
        }]
        self.db.write_points(data)

    def count_unique(self):
        query = f"""
        SELECT COUNT(DISTINCT("id_value")) FROM "{self.measurement}"
        WHERE time > now() - {self.window_seconds}s
        """
        result = self.db.query(query)
        points = list(result.get_points())
        return points[0]['count'] if points else 0
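A usage sketch, assuming an InfluxDB 1.x server and an existing database; host, port, database name, and measurement are placeholders:

# Placeholder connection settings for an InfluxDB 1.x server
client = InfluxDBClient(host="localhost", port=8086, database="metrics")
counter = TSDBRollingUniqueCounter(client, measurement="page_views", window_seconds=3600)

counter.add_id("user_1")
counter.add_id("user_2")
counter.add_id("user_1")
print(counter.count_unique())     # expected: 2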
Cause: too much historical data is retained and expired records are never cleaned up, so memory usage keeps growing.
Solution: use an approximate data structure such as HyperLogLog and reset it periodically, as in the implementation below:
import time
from hyperloglog import HyperLogLog

class HLLRollingCounter:
    """Approximate unique counter; trades exactness for constant memory."""

    def __init__(self, window_seconds, error_rate=0.01):
        self.window_seconds = window_seconds
        self.error_rate = error_rate  # the hyperloglog package takes a relative error rate, not a register precision
        self.hll = HyperLogLog(error_rate)
        self.last_cleanup = time.time()

    def add_id(self, id):
        current_time = time.time()
        self.hll.add(str(id))
        # Periodically reset the counter so old IDs age out
        if current_time - self.last_cleanup >= self.window_seconds:
            self.hll = HyperLogLog(self.error_rate)
            self.last_cleanup = current_time

    def count_unique(self):
        return len(self.hll)
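A usage sketch with illustrative parameters. Because the whole HyperLogLog is discarded every window_seconds, this behaves more like a tumbling window than a strictly rolling one, and the count it returns is an estimate:

# Illustrative parameters; the result is approximate by design
counter = HLLRollingCounter(window_seconds=600, error_rate=0.01)
for user_id in ["u1", "u2", "u3", "u1"]:
    counter.add_id(user_id)
print(counter.count_unique())     # roughly 3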
Cause: data-synchronization delays or network partitions in a distributed environment lead to inconsistent counts across nodes.
Solution: have every node read and write a single shared, centralized store (for example Redis) so there is one source of truth.
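One possible sketch of such a centralized store, assuming a single Redis instance shared by all nodes: keep one sorted set whose members are the IDs themselves and whose scores are last-seen timestamps. The class name and key below are illustrative and not part of the implementations above:

import time
import redis

class CentralRollingUniqueCounter:
    """Sketch: all nodes write one shared sorted set (member = ID, score = last-seen time)."""

    def __init__(self, redis_client, key, window_seconds):
        self.redis = redis_client
        self.key = key                      # e.g. "rolling:unique_ids" (illustrative)
        self.window_seconds = window_seconds

    def add_id(self, id):
        # ZADD keeps a single member per ID and simply refreshes its score
        self.redis.zadd(self.key, {str(id): time.time()})

    def count_unique(self):
        cutoff = time.time() - self.window_seconds
        # Drop members whose last activity fell outside the window, then count the rest
        self.redis.zremrangebyscore(self.key, 0, cutoff)
        return self.redis.zcard(self.key)

Because every node writes the same key, all nodes observe the same count, and storing one member per ID keeps the set from growing with repeated activity from the same ID.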
Cause: multiple threads or processes update the counter at the same time, causing race conditions.
Solution: protect shared state with a lock and serialize updates through a queue, as in the implementation below:
import threading
import time
from collections import defaultdict
from queue import Queue

class ThreadSafeRollingCounter:
    """Rolling unique counter that is safe for concurrent producers."""

    def __init__(self, window_seconds):
        self.window_seconds = window_seconds
        self.id_timestamps = defaultdict(list)
        self.lock = threading.Lock()
        self.queue = Queue()
        self.worker_thread = threading.Thread(target=self._process_queue)
        self.worker_thread.daemon = True
        self.worker_thread.start()

    def add_id(self, id):
        # Producers only enqueue; a single worker thread applies the updates
        self.queue.put((id, time.time()))

    def _process_queue(self):
        while True:
            id, timestamp = self.queue.get()
            with self.lock:
                self.id_timestamps[id].append(timestamp)
                self._cleanup(timestamp)

    def count_unique(self):
        with self.lock:
            self._cleanup(time.time())
            return len(self.id_timestamps)

    def _cleanup(self, current_time):
        # Same logic as RollingUniqueCounter._cleanup above
        cutoff = current_time - self.window_seconds
        for id in list(self.id_timestamps):
            kept = [t for t in self.id_timestamps[id] if t >= cutoff]
            if kept:
                self.id_timestamps[id] = kept
            else:
                del self.id_timestamps[id]
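A usage sketch with several producer threads; the thread count, the IDs, and the short sleep that lets the background worker drain the queue are illustrative:

# Illustrative concurrent usage of the thread-safe counter
counter = ThreadSafeRollingCounter(window_seconds=60)

def producer(ids):
    for id in ids:
        counter.add_id(id)

threads = [threading.Thread(target=producer, args=(["a", "b", "c"],)) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()

time.sleep(0.1)                   # allow the background worker to process the queue
print(counter.count_unique())     # -> 3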
Counting unique IDs over a rolling time window is an involved but very useful technique; choosing the implementation that fits your specific scenario can significantly improve both system performance and accuracy.