r19233@catbus: nickm | 2008-04-08 13:06:34 -0400

When we remove old routers, use Bloom filters rather than a digestmap-based set in order to tell which ones we absolutely need to keep.  This will save us roughly a kazillion little short-lived allocations for hash table entries.


svn:r14318
This commit is contained in:
Nick Mathewson 2008-04-08 17:06:41 +00:00
parent 0c9efd6a1e
commit a627407fcb
5 changed files with 201 additions and 20 deletions

View File

@ -51,6 +51,10 @@ Changes in version 0.2.1.1-alpha - 2008-??-??
- New --hush command-line option similar to --quiet. While --quiet
disables all logging to the console on startup, --hush limits the
output to messages of warning and error severity.
- Use a Bloom filter rather than a digest-based set to track which
descriptors we need to keep around when we're cleaning out old
router descriptors. This speeds up the computation significantly, and
may reduce fragmentation.
o Code simplifications and refactoring:
- Refactor code using connection_ap_handshake_attach_circuit() to

View File

@ -1192,3 +1192,23 @@ IMPLEMENT_ORDER_FUNC(find_nth_double, double)
IMPLEMENT_ORDER_FUNC(find_nth_uint32, uint32_t)
IMPLEMENT_ORDER_FUNC(find_nth_long, long)
/** Return a newly allocated digestset_t, optimized to hold a total of
* <b>max_elements</b> digests with a reasonably low false positive weight. */
digestset_t *
digestset_new(int max_elements)
{
int n_bits = 1u << (tor_log2(max_elements)+5);
digestset_t *r = tor_malloc(sizeof(digestset_t));
r->mask = n_bits - 1;
r->ba = bitarray_init_zero(n_bits);
return r;
}
/** Free all storage held in <b>set</b>. */
void
digestset_free(digestset_t *set)
{
bitarray_free(set->ba);
tor_free(set);
}

View File

@ -564,6 +564,49 @@ bitarray_is_set(bitarray_t *b, int bit)
return b[bit >> BITARRAY_SHIFT] & (1u << (bit & BITARRAY_MASK));
}
/** A set of digests, implemented as a Bloom filter. */
typedef struct {
int mask; /* One less than the number of bits in <b>ba</b>; always one less
* than a power of two. */
bitarray_t *ba; /* A bit array to implement the Bloom filter. */
} digestset_t;
#define BIT(n) ((n) & set->mask)
/** Add the digest <b>digest</b> to <b>set</b>. */
static INLINE void
digestset_add(digestset_t *set, const char *digest)
{
const uint32_t *p = (const uint32_t *)digest;
const uint32_t d1 = p[0] + (p[1]>>16);
const uint32_t d2 = p[1] + (p[2]>>16);
const uint32_t d3 = p[2] + (p[3]>>16);
const uint32_t d4 = p[3] + (p[0]>>16);
bitarray_set(set->ba, BIT(d1));
bitarray_set(set->ba, BIT(d2));
bitarray_set(set->ba, BIT(d3));
bitarray_set(set->ba, BIT(d4));
}
/** If <b>digest</b> is in <b>set</b>, return nonzero. Otherwise,
* <em>probably</em> return zero. */
static INLINE int
digestset_isin(const digestset_t *set, const char *digest)
{
const uint32_t *p = (const uint32_t *)digest;
const uint32_t d1 = p[0] + (p[1]>>16);
const uint32_t d2 = p[1] + (p[2]>>16);
const uint32_t d3 = p[2] + (p[3]>>16);
const uint32_t d4 = p[3] + (p[0]>>16);
return bitarray_is_set(set->ba, BIT(d1)) &&
bitarray_is_set(set->ba, BIT(d2)) &&
bitarray_is_set(set->ba, BIT(d3)) &&
bitarray_is_set(set->ba, BIT(d4));
}
#undef BIT
digestset_t *digestset_new(int max_elements);
void digestset_free(digestset_t* set);
/* These functions, given an <b>array</b> of <b>n_elements</b>, return the
* <b>nth</b> lowest element. <b>nth</b>=0 gives the lowest element;
* <b>n_elements</b>-1 gives the highest; and (<b>n_elements</b>-1) / 2 gives

View File

@ -2957,7 +2957,7 @@ _compare_duration_idx(const void *_d1, const void *_d2)
static void
routerlist_remove_old_cached_routers_with_id(time_t now,
time_t cutoff, int lo, int hi,
digestmap_t *retain)
digestset_t *retain)
{
int i, n = hi-lo+1;
unsigned n_extra, n_rmv = 0;
@ -2974,10 +2974,9 @@ routerlist_remove_old_cached_routers_with_id(time_t now,
tor_assert(!memcmp(ident, r->identity_digest, DIGEST_LEN));
}
#endif
/* Check whether we need to do anything at all. */
{
int mdpr = directory_caches_dir_info(get_options()) ? 5 : 2;
int mdpr = directory_caches_dir_info(get_options()) ? 2 : 1;
if (n <= mdpr)
return;
n_extra = n - mdpr;
@ -2993,7 +2992,7 @@ routerlist_remove_old_cached_routers_with_id(time_t now,
signed_descriptor_t *r_next;
lifespans[i-lo].idx = i;
if (r->last_listed_as_valid_until >= now ||
(retain && digestmap_get(retain, r->signed_descriptor_digest))) {
(retain && digestset_isin(retain, r->signed_descriptor_digest))) {
must_keep[i-lo] = 1;
}
if (i < hi) {
@ -3049,10 +3048,11 @@ routerlist_remove_old_routers(void)
time_t cutoff;
routerinfo_t *router;
signed_descriptor_t *sd;
digestmap_t *retain;
digestset_t *retain;
int caches = directory_caches_dir_info(get_options());
const networkstatus_t *consensus = networkstatus_get_latest_consensus();
const smartlist_t *networkstatus_v2_list = networkstatus_get_v2_list();
int n_expected_retain = 0;
trusted_dirs_remove_old_certs();
@ -3061,7 +3061,18 @@ routerlist_remove_old_routers(void)
// routerlist_assert_ok(routerlist);
retain = digestmap_new();
n_expected_retain = smartlist_len(consensus->routerstatus_list);
if (caches &&
networkstatus_v2_list && smartlist_len(networkstatus_v2_list)) {
SMARTLIST_FOREACH(networkstatus_v2_list, networkstatus_v2_t *, ns,
n_expected_retain += smartlist_len(ns->entries));
/*XXXX021 too much magic. */
n_expected_retain /= (smartlist_len(networkstatus_v2_list)/2+1);
}
//log_notice(LD_DIR,"n_expected_retain=%d",n_expected_retain);
retain = digestset_new(n_expected_retain);
cutoff = now - OLD_ROUTER_DESC_MAX_AGE;
/* Build a list of all the descriptors that _anybody_ lists. */
if (caches) {
@ -3077,7 +3088,7 @@ routerlist_remove_old_routers(void)
* system will obsolete this whole thing in 0.2.0.x. */
SMARTLIST_FOREACH(ns->entries, routerstatus_t *, rs,
if (rs->published_on >= cutoff)
digestmap_set(retain, rs->descriptor_digest, (void*)1));
digestset_add(retain, rs->descriptor_digest));
});
}
@ -3085,13 +3096,13 @@ routerlist_remove_old_routers(void)
if (consensus) {
SMARTLIST_FOREACH(consensus->routerstatus_list, routerstatus_t *, rs,
if (rs->published_on >= cutoff)
digestmap_set(retain, rs->descriptor_digest, (void*)1));
digestset_add(retain, rs->descriptor_digest));
}
/* If we have a bunch of networkstatuses, we should consider pruning current
* routers that are too old and that nobody recommends. (If we don't have
* enough networkstatuses, then we should get more before we decide to kill
* routers.) */
/* If we have nearly as many networkstatuses as we want, we should consider
* pruning current routers that are too old and that nobody recommends. (If
* we don't have enough networkstatuses, then we should get more before we
* decide to kill routers.) */
if (!caches ||
smartlist_len(networkstatus_v2_list) > get_n_v2_authorities() / 2) {
cutoff = now - ROUTER_MAX_AGE;
@ -3100,7 +3111,8 @@ routerlist_remove_old_routers(void)
router = smartlist_get(routerlist->routers, i);
if (router->cache_info.published_on <= cutoff &&
router->cache_info.last_listed_as_valid_until < now &&
!digestmap_get(retain,router->cache_info.signed_descriptor_digest)) {
!digestset_isin(retain,
router->cache_info.signed_descriptor_digest)) {
/* Too old: remove it. (If we're a cache, just move it into
* old_routers.) */
log_info(LD_DIR,
@ -3120,7 +3132,7 @@ routerlist_remove_old_routers(void)
sd = smartlist_get(routerlist->old_routers, i);
if (sd->published_on <= cutoff &&
sd->last_listed_as_valid_until < now &&
!digestmap_get(retain, sd->signed_descriptor_digest)) {
!digestset_isin(retain, sd->signed_descriptor_digest)) {
/* Too old. Remove it. */
routerlist_remove_old(routerlist, sd, i--);
}
@ -3128,11 +3140,9 @@ routerlist_remove_old_routers(void)
//routerlist_assert_ok(routerlist);
log_info(LD_DIR, "We have %d live routers and %d old router descriptors. "
"At most %d must be retained because of networkstatuses.",
log_info(LD_DIR, "We have %d live routers and %d old router descriptors.",
smartlist_len(routerlist->routers),
smartlist_len(routerlist->old_routers),
digestmap_size(retain));
smartlist_len(routerlist->old_routers));
/* Now we might have to look at routerlist->old_routers for extraneous
* members. (We'd keep all the members if we could, but we need to save
@ -3141,9 +3151,10 @@ routerlist_remove_old_routers(void)
* total number doesn't approach max_descriptors_per_router()*len(router).
*/
if (smartlist_len(routerlist->old_routers) <
smartlist_len(routerlist->routers) * (caches?4:2))
smartlist_len(routerlist->routers))
goto done;
/* Sort by identity, then fix indices. */
smartlist_sort(routerlist->old_routers, _compare_old_routers_by_identity);
/* Fix indices. */
for (i = 0; i < smartlist_len(routerlist->old_routers); ++i) {
@ -3171,7 +3182,7 @@ routerlist_remove_old_routers(void)
//routerlist_assert_ok(routerlist);
done:
digestmap_free(retain, NULL);
digestset_free(retain);
}
/** We just added a new set of descriptors. Take whatever extra steps

View File

@ -1922,6 +1922,40 @@ test_util_bitarray(void)
bitarray_free(ba);
}
static void
test_util_digestset(void)
{
smartlist_t *included = smartlist_create();
char d[DIGEST_LEN];
int i;
int ok = 1;
int false_positives = 0;
digestset_t *set;
for (i = 0; i < 1000; ++i) {
crypto_rand(d, DIGEST_LEN);
smartlist_add(included, tor_memdup(d, DIGEST_LEN));
}
set = digestset_new(1000);
SMARTLIST_FOREACH(included, const char *, cp,
if (digestset_isin(set, cp))
ok = 0);
test_assert(ok);
SMARTLIST_FOREACH(included, const char *, cp,
digestset_add(set, cp));
SMARTLIST_FOREACH(included, const char *, cp,
if (!digestset_isin(set, cp))
ok = 0);
test_assert(ok);
for (i = 0; i < 1000; ++i) {
crypto_rand(d, DIGEST_LEN);
if (digestset_isin(set, d))
++false_positives;
}
test_assert(false_positives < 50); /* Should be far lower. */
digestset_free(set);
}
/* stop threads running at once. */
static tor_mutex_t *_thread_test_mutex = NULL;
/* make sure that threads have to run at the same time. */
@ -3361,6 +3395,69 @@ bench_aes(void)
crypto_free_cipher_env(c);
}
static void
bench_dmap(void)
{
smartlist_t *sl = smartlist_create();
smartlist_t *sl2 = smartlist_create();
struct timeval start, end, pt2, pt3, pt4;
const int iters = 10000;
const int elts = 4000;
const int fpostests = 1000000;
char d[20];
int i,n=0, fp = 0;
digestmap_t *dm = digestmap_new();
digestset_t *ds = digestset_new(elts);
for (i = 0; i < elts; ++i) {
crypto_rand(d, 20);
smartlist_add(sl, tor_memdup(d, 20));
}
for (i = 0; i < elts; ++i) {
crypto_rand(d, 20);
smartlist_add(sl2, tor_memdup(d, 20));
}
printf("nbits=%d\n", ds->mask+1);
tor_gettimeofday(&start);
for (i = 0; i < iters; ++i) {
SMARTLIST_FOREACH(sl, const char *, cp, digestmap_set(dm, cp, (void*)1));
}
tor_gettimeofday(&pt2);
for (i = 0; i < iters; ++i) {
SMARTLIST_FOREACH(sl, const char *, cp, digestmap_get(dm, cp));
SMARTLIST_FOREACH(sl2, const char *, cp, digestmap_get(dm, cp));
}
tor_gettimeofday(&pt3);
for (i = 0; i < iters; ++i) {
SMARTLIST_FOREACH(sl, const char *, cp, digestset_add(ds, cp));
}
tor_gettimeofday(&pt4);
for (i = 0; i < iters; ++i) {
SMARTLIST_FOREACH(sl, const char *, cp, n += digestset_isin(ds, cp));
SMARTLIST_FOREACH(sl2, const char *, cp, n += digestset_isin(ds, cp));
}
tor_gettimeofday(&end);
for (i = 0; i < fpostests; ++i) {
crypto_rand(d, 20);
if (digestset_isin(ds, d)) ++fp;
}
printf("%ld\n",(unsigned long)tv_udiff(&start, &pt2));
printf("%ld\n",(unsigned long)tv_udiff(&pt2, &pt3));
printf("%ld\n",(unsigned long)tv_udiff(&pt3, &pt4));
printf("%ld\n",(unsigned long)tv_udiff(&pt4, &end));
printf("-- %d\n", n);
printf("++ %f\n", fp/(double)fpostests);
digestmap_free(dm, NULL);
digestset_free(ds);
SMARTLIST_FOREACH(sl, char *, cp, tor_free(cp));
SMARTLIST_FOREACH(sl2, char *, cp, tor_free(cp));
smartlist_free(sl);
smartlist_free(sl2);
}
static void
test_util_mempool(void)
{
@ -3850,6 +3947,7 @@ static struct {
SUBENT(util, datadir),
SUBENT(util, smartlist),
SUBENT(util, bitarray),
SUBENT(util, digestset),
SUBENT(util, mempool),
SUBENT(util, memarea),
SUBENT(util, strmap),
@ -3960,6 +4058,11 @@ main(int c, char**v)
return 0;
}
if (0) {
bench_dmap();
return 0;
}
atexit(remove_directory);
printf("Running Tor unit tests on %s\n", get_uname());