1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
// use variable names from paper, namely registers M
#![allow(non_snake_case)]

#![crate_name = "basichll"]
#![crate_type = "lib"]

/* -------------------- std libs ------------------- */
use std::cmp;
use std::fmt;
use std::hash::{ Hash, Hasher, SipHasher };
use std::iter::{ repeat, Iterator };
/* ------------------------------------------------- */

/* -------------------- helpers -------------------- */

fn alpha (m: usize) -> f64 {
    match m {
        16 => 0.673,
        32 => 0.697,
        64 => 0.709,
        _  => 0.7213/(1f64 + 1.079/(m as f64))
    }
}

fn leftmost_one_bit (v: u64) -> usize {
    let mut counter: usize = 0;
    let mut shiftable = v;

    while shiftable > 0 {
        shiftable >>= 1;
        counter += 1;
    }

    64 - counter
}

/* ------------------------------------------------- */

/* --------------- hyper log log ------------------- */

/// The `HLL` struct stores the underlying registers
/// of the HyperLogLog, with `m` many registers.
///
/// The value for `m` is derived from the desired error
/// of the estimation. The error is considered to be
/// 1.04 / sqrt(2 ^ bits). So, given a desired 1% error
/// rate, we result in around 12 bits per register,
/// producing an underlying M of 2^12 bits (i.e. 4kb).
pub struct HLL {
    alpha: f64,
    b: u32,
    m: usize,
    M: Vec<u8>,
    // [!] this is pretty much a hack until I can think of something
    //     better to do for the monoid instance. Basically we need the
    //     zero value to always satisfy the left and right identity laws
    //     but because you need to have the number of registers be
    //     equivalent to do addition, it implies you need a distinct
    //     monoid instance per HLL configuration (i.e. an HLL of 4
    //     registers has a distinct zero from a HLL of 16).
    //
    //     Because I can't really do dependent types in rust, we instead
    //     have a distinguished zero element, that is allowed to bypass
    //     the assertions on the equivalent sizes...
    //
    //     For reference, Ed Kmett's haskell implementation actually does
    //     use higher kinded types and some tricky reflection to make the
    //     monoid instance be parameterised by a config type that is itself
    //     typed to a Peano natural encoded type. Twitter's algebird instance
    //     declares a separate class that encapsulates the notion of
    //     a "HyperLogLogMonoid" instance, and instances of that class are
    //     instantiated with the register size (i.e. the 'instance' for monoid
    //     is just an 'instance' of the class, for two interpretations of
    //     'instance').
    isZero: bool
}

impl HLL {
    /// Convenience function to produce a HLL with
    /// one hundred and twenty eight registers.
    pub fn one_hundred_twenty_eight () -> HLL {
        HLL::new(0.09192)
    }

    /// Create a new HLL with the desired standard error.
    /// Some examples might be:
    ///
    ///     bits    size    error
    ///     12      4096    0.0163
    ///     13      8192    0.0115
    ///     14      16384   0.0081
    ///
    /// The error must be between 0.0 and 1.0. Beware using a stupidly
    /// small error size will grow beyond e.g. isize at less than 0.03
    /// (if you need something that isn't an estimate probably don't use
    /// a hyperloglog).
    pub fn new(error: f64) -> HLL {
        assert!(error > 0.0 && error < 1.0);
        // error = 1.04 / sqrt(m)
        let m = ((1.04/error) * (1.04/error)).floor() as usize;
        let b = (m as f64).log2() as u32;

        HLL {
            alpha: alpha(m),
            b: b,
            m: m,
            M: repeat(0u8).take(m).collect(),
            isZero: false
        }
    }

    /// Add an element into the hyperloglog estimate.
    /// We require the type of value to be able to be hashed.
    pub fn insert<T: Hash>(&mut self, val: &T) {
        let mut hasher = SipHasher::new();
        val.hash(&mut hasher);
        let hash = hasher.finish();

        // j is the first b many bits
        let j = (hash >> (64 - self.b)) as usize;
        // w is the remaining bits (i.e. b -> 64)
        let w = hash & (2u64.pow(64 - self.b) - 1) as u64;
        let rho = leftmost_one_bit(w) as u8;

        self.M[j] = cmp::max(self.M[j], rho);
    }

    /// Return the estimated cardinality of the observed set
    /// of elements.
    pub fn count(&self) -> f64 {
        self.range_correction(self.raw_estimate())
    }

    fn raw_estimate(&self) -> f64 {
        let sum: f64 = self.M.iter().fold (0.0, |p, &r| p + 2.0f64.powi(-(r as i32)));

        self.alpha as f64 * ((self.m  * self.m) as f64) * (1.0 / sum)
    }

    fn empty_registers (&self) -> usize {
        self.M.iter().filter(|&r| *r == 0).count()
    }

    fn range_correction(&self, e: f64) -> f64 {
        let twoTo32: f64 = 2.0f64.powi(32) as f64;

        // small range correction
        if e <= (5.0*(self.m as f64))/2.0 {
            let v  = self.empty_registers() as f64;
            let fm = self.m as f64;

            return match v {
                0.0 => e,
                _   => fm * (fm / v).ln()
            }
        }

        // medium range (no) correction
        if e <= twoTo32/30.0 {
            return e;
        }

        -twoTo32 * (1.0 - e/twoTo32).ln()
    }

    /// Access a copy of the underlying registers (this is mainly
    /// here just for debugging/testing... its unlikely you'll need
    /// this access typically).
    pub fn registers(&self) -> Vec<u8> {
        self.M.clone()
    }

    /// Copies this instance into a new HLL.
    /// Doesn't do anything tricky (i.e. this will entirely
    /// re-allocate the underlying registers).
    pub fn clone (&self) -> HLL {
        HLL {
            alpha: self.alpha,
            b: self.b,
            m: self.m,
            M: self.M.clone(),
            isZero: self.isZero
        }
    }

    /// A completely zeroed HLL. Not particularly useful
    /// except as an identity element in `Add`.
    pub fn empty () -> HLL {
        HLL {
            alpha: 0.0,
            m: 0,
            b: 0,
            M: Vec::new(),
            isZero: true
        }
    }

}

/* ------------------------------------------------- */

/* --------------- trait instances ----------------- */

impl std::fmt::Display for HLL {
    fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
        write!(formatter, "α: {}, b: {}, m: {}", self.alpha, self.b, self.m)
    }
}

impl std::fmt::Debug for HLL {
    fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
        write!(formatter, "α: {}, b: {}, m: {}", self.alpha, self.b, self.m)
    }
}

/// A HLL is considered equal to another HLL when its
/// configuration and registers are exactly identical.
impl std::cmp::Eq for HLL {}
impl std::cmp::PartialEq for HLL {
    fn eq(&self, other: &HLL) -> bool {
           self.alpha  == other.alpha
        && self.m      == other.m
        && self.b      == other.b
        && self.M      == other.M
        && self.isZero == other.isZero
    }
}

fn zipWith<R, U: Iterator, C: Fn(U::Item, U::Item) -> R> (combo: C, left: U, right: U)
   -> Vec<R> {
    left.zip(right).map(| (l, r) | combo(l, r)).collect()
}

fn mergeRegisters (first: &Vec<u8>, second: &Vec<u8>) -> Vec<u8> {
    zipWith(|&l, &r| std::cmp::max(l, r), first.iter(), second.iter())
}

/// Adding together two HLLs produces a new HLL where
/// the larger value of each register has been selected.
impl<'a> std::ops::Add<&'a HLL> for &'a HLL {
    type Output = HLL;

    fn add(self, other: &'a HLL) -> HLL {
        // [!] gross...
        if self.isZero { return other.clone(); }
        if other.isZero { return self.clone(); }

        assert!(self.alpha == other.alpha);
        assert!(self.b == other.b);
        assert!(self.m == other.m);

        HLL {
            alpha: self.alpha,
            b: self.b,
            m: self.m,
            M: mergeRegisters(&self.M, &other.M),
            isZero: false
        }
    }
}

/* ------------------------------------------------- */