Commit eb9a199d authored by Philip Carns's avatar Philip Carns

add crush/vring patch to repo and update docs

- this includes moving the INSTALL file instructions into README.md
parent 4d8ec7dd
Pipeline #12 skipped
# default instructions
# -----------------------------------
./prepare # if cloned from git
./configure --prefix=/desired/installation/path
make -j 3
make install
make check # optional
# if you want to enable CRUSH support
#-------------------------------------
#
# NOTE: this has only been tested with Ceph version 0.94.3. The CRUSH API
# may be different in other versions of Ceph.
#
# NOTE: the following instructions assume that you have compiled Ceph in
# a separate subdirectory from the Ceph source code called "build". If that
# is not the case then just remove the "build/" prefix from files described
# below.
#
# compile and install CEPH to $CEPH_PREFIX
# copy src/crush/{crush|builder|mapper|hash}.h to $CEPH_PREFIX/include
# copy build/src/acconfig.h to $CEPH_PREFIX/include
# make directory $CEPH_PREFIX/include/include
# copy src/include/int_types.h to $CEPH_PREFIX/include/include
# follow above default instructions, except add --with-crush=$CEPH_PREFIX
# libch-placement
libch-placement is a modular consistent hashing library. It can be used by a
distributed storage system to access multiple hashing algorithms, distance
metrics, and virtual node settings using a consistent API.
## Basic installation
```
./prepare # if cloned from git
./configure --prefix=/desired/installation/path
make -j 3
make install
make check # optional
```
## To enable optional CRUSH support
NOTE: this has only been tested with Ceph version 0.94.3. The CRUSH API
may be different in other versions of Ceph.
NOTE: The CRUSH API is not normally exposed in the default Ceph header files.
The following instructions explain how to copy the necessary files into the Ceph
installation directory so that CRUSH can be used by an external library.
The following instructions assume that you have compiled Ceph in
a separate subdirectory from the Ceph source code called "build". If that
is not the case then just remove the "build/" prefix from files described
below.
* compile and install CEPH to $CEPH_PREFIX
* copy src/crush/{crush|builder|mapper|hash}.h to $CEPH_PREFIX/include
* copy build/src/acconfig.h to $CEPH_PREFIX/include
* make directory $CEPH_PREFIX/include/include
* copy src/include/int_types.h to $CEPH_PREFIX/include/include
* follow basic installation instructions for libch-placement, except add
--with-crush=$CEPH_PREFIX to the configure command line
## To enable "vring" bucket support
The vring bucket is an experimental bucket algorith for CRUSH. To enable it:
* apply the patches/ceph-0.94.3-crush-vring.patch to Ceph
* modify include/ch-placement.h to define CH_ENABLE_CRUSH_VRING
* follow the instructions above to install ch-placement with CRUSH support
diff -Naupr ceph-0.94.3-stock/src/crush/builder.c ceph-0.94.3-vring-bucket/src/crush/builder.c
--- ceph-0.94.3-stock/src/crush/builder.c 2015-08-26 13:50:52.000000000 -0400
+++ ceph-0.94.3-vring-bucket/src/crush/builder.c 2016-04-07 16:44:09.976740147 -0400
@@ -15,6 +15,9 @@
#define BUG_ON(x) assert(!(x))
+static int crush_add_vring_bucket_item(struct crush_bucket_vring *bucket, int item, int weight);
+static int crush_remove_vring_bucket_item(struct crush_bucket_vring *bucket, int item);
+
struct crush_map *crush_create()
{
struct crush_map *m;
@@ -681,6 +684,8 @@ crush_make_bucket(struct crush_map *map,
return (struct crush_bucket *)crush_make_straw_bucket(map, hash, type, size, items, weights);
case CRUSH_BUCKET_STRAW2:
return (struct crush_bucket *)crush_make_straw2_bucket(map, hash, type, size, items, weights);
+ case CRUSH_BUCKET_VRING:
+ return (struct crush_bucket *)crush_make_vring_bucket(map, hash, type, size, items, weights);
}
return 0;
}
@@ -913,6 +918,8 @@ int crush_bucket_add_item(struct crush_m
return crush_add_straw_bucket_item(map, (struct crush_bucket_straw *)b, item, weight);
case CRUSH_BUCKET_STRAW2:
return crush_add_straw2_bucket_item(map, (struct crush_bucket_straw2 *)b, item, weight);
+ case CRUSH_BUCKET_VRING:
+ return crush_add_vring_bucket_item((struct crush_bucket_vring *)b, item, weight);
default:
return -1;
}
@@ -1184,6 +1191,8 @@ int crush_bucket_remove_item(struct crus
return crush_remove_straw_bucket_item(map, (struct crush_bucket_straw *)b, item);
case CRUSH_BUCKET_STRAW2:
return crush_remove_straw2_bucket_item(map, (struct crush_bucket_straw2 *)b, item);
+ case CRUSH_BUCKET_VRING:
+ return crush_remove_vring_bucket_item((struct crush_bucket_vring *)b, item);
default:
return -1;
}
@@ -1318,6 +1327,9 @@ int crush_bucket_adjust_item_weight(stru
return crush_adjust_straw2_bucket_item_weight(map,
(struct crush_bucket_straw2 *)b,
item, weight);
+ case CRUSH_BUCKET_VRING:
+ /* TODO: implement this */
+ assert(0);
default:
return -1;
}
@@ -1458,10 +1470,197 @@ int crush_reweight_bucket(struct crush_m
return crush_reweight_straw_bucket(crush, (struct crush_bucket_straw *)b);
case CRUSH_BUCKET_STRAW2:
return crush_reweight_straw2_bucket(crush, (struct crush_bucket_straw2 *)b);
+ case CRUSH_BUCKET_VRING:
+ /* TODO: implement this */
+ assert(0);
default:
return -1;
}
}
/***************************/
+
+/* BEGIN vring bucket type implementation */
+
+/* vring bucket implementation notes, 2016
+ *
+ * The algorithm itself is described in:
+ *
+ * Philip Carns, Kevin Harms, John Jenkins, Misbah Mubarak, Robert Ross, and
+ * Christopher Carothers. Impact of Data Placement on Resilience in
+ * Large-Scale Object Storage Systems. In Proceedings of the 32nd
+ * International Conference on Massive Storage Systems and Technology
+ * (MSST 2016).
+ *
+ * This is a research prototype with the following limitations:
+ *
+ * - It has not been tuned for memory efficiency.
+ * - It does not implement weighting, though in principle this could be done
+ * by scaling the vnode factor for each element according to its weight.
+ */
+
+#define VRING_DEFAULT_VNODE_FACTOR 1024
+
+static int virtual_cmp(const void* a, const void *b)
+{
+ const struct crush_bucket_vring_entry *s_a = a;
+ const struct crush_bucket_vring_entry *s_b = b;
+
+ if(s_a->virtual_id < s_b->virtual_id)
+ return(-1);
+ else if(s_a->virtual_id > s_b->virtual_id)
+ return(1);
+ else
+ return(0);
+}
+
+static int crush_calc_vring(struct crush_bucket_vring *bucket)
+{
+ int i, j;
+
+ if(bucket->ring)
+ free(bucket->ring);
+
+ bucket->ring = malloc(bucket->vnode_factor*bucket->h.size*sizeof(*bucket->ring));
+ if (!bucket->ring)
+ return(-1);
+
+ for(i=0; i<bucket->h.size; i++)
+ {
+ for(j=0; j<bucket->vnode_factor; j++)
+ {
+ __u32 tmp_item = bucket->h.items[i];
+ __u32 tmp_vnode_idx = j;
+ __u32 top;
+ bucket->ring[j+i*bucket->vnode_factor].item = bucket->h.items[i];
+
+ /* to get 64 bit virtual ids we do two 32 bit hashes and shift */
+ bucket->ring[j+i*bucket->vnode_factor].virtual_id =
+ crush_hash32_3(bucket->h.hash, tmp_item, tmp_vnode_idx, 0);
+ top =
+ crush_hash32_3(bucket->h.hash, tmp_item, tmp_vnode_idx, 1);
+ bucket->ring[j+i*bucket->vnode_factor].virtual_id += (((uint64_t)top)<<32);
+ }
+ }
+
+ qsort(bucket->ring, bucket->h.size * bucket->vnode_factor,
+ sizeof(*bucket->ring), virtual_cmp);
+
+ for(i=0; i<bucket->h.size*bucket->vnode_factor; i++)
+ {
+ bucket->ring[i].array_idx = i;
+ bucket->ring[i].ring_size = bucket->h.size*bucket->vnode_factor;
+ }
+
+ return(0);
+}
+
+struct crush_bucket_vring *
+crush_make_vring_bucket(struct crush_map *map,
+ int hash,
+ int type,
+ int size,
+ int *items,
+ int *weights)
+{
+ struct crush_bucket_vring *bucket;
+ int i;
+
+ bucket = malloc(sizeof(*bucket));
+ if (!bucket)
+ return NULL;
+ memset(bucket, 0, sizeof(*bucket));
+ bucket->h.alg = CRUSH_BUCKET_VRING;
+ bucket->h.hash = hash;
+ bucket->h.type = type;
+ bucket->h.size = size;
+
+ bucket->h.items = malloc(sizeof(__s32)*size);
+ if (!bucket->h.items)
+ goto err;
+ bucket->h.perm = malloc(sizeof(__u32)*size);
+ if (!bucket->h.perm)
+ goto err;
+ bucket->ring = NULL;
+ bucket->vnode_factor = VRING_DEFAULT_VNODE_FACTOR;
+
+ bucket->h.weight = 0;
+ for (i=0; i<size; i++) {
+ bucket->h.items[i] = items[i];
+ bucket->h.weight += weights[i];
+ }
+
+ if (crush_calc_vring(bucket) < 0)
+ goto err;
+
+ return bucket;
+err:
+ free(bucket->ring);
+ free(bucket->h.perm);
+ free(bucket->h.items);
+ free(bucket);
+ return NULL;
+}
+
+static int crush_add_vring_bucket_item(struct crush_bucket_vring *bucket, int item, int weight)
+{
+ int newsize = bucket->h.size + 1;
+
+ void *_realloc = NULL;
+
+ if ((_realloc = realloc(bucket->h.items, sizeof(__s32)*newsize)) == NULL) {
+ return -ENOMEM;
+ } else {
+ bucket->h.items = _realloc;
+ }
+ if ((_realloc = realloc(bucket->h.perm, sizeof(__u32)*newsize)) == NULL) {
+ return -ENOMEM;
+ } else {
+ bucket->h.perm = _realloc;
+ }
+
+ bucket->h.items[newsize-1] = item;
+
+ if (crush_addition_is_unsafe(bucket->h.weight, weight))
+ return -ERANGE;
+
+ bucket->h.weight += weight;
+ bucket->h.size++;
+
+ return crush_calc_vring(bucket);
+}
+
+static int crush_remove_vring_bucket_item(struct crush_bucket_vring *bucket, int item)
+{
+ int newsize = bucket->h.size - 1;
+ unsigned i, j;
+
+ for (i = 0; i < bucket->h.size; i++) {
+ if (bucket->h.items[i] == item) {
+ bucket->h.size--;
+ for (j = i; j < bucket->h.size; j++) {
+ bucket->h.items[j] = bucket->h.items[j+1];
+ }
+ break;
+ }
+ }
+ if (i == bucket->h.size)
+ return -ENOENT;
+
+ void *_realloc = NULL;
+
+ if ((_realloc = realloc(bucket->h.items, sizeof(__s32)*newsize)) == NULL) {
+ return -ENOMEM;
+ } else {
+ bucket->h.items = _realloc;
+ }
+ if ((_realloc = realloc(bucket->h.perm, sizeof(__u32)*newsize)) == NULL) {
+ return -ENOMEM;
+ } else {
+ bucket->h.perm = _realloc;
+ }
+
+ return crush_calc_vring(bucket);
+}
+/* END vring bucket type implementation */
diff -Naupr ceph-0.94.3-stock/src/crush/builder.h ceph-0.94.3-vring-bucket/src/crush/builder.h
--- ceph-0.94.3-stock/src/crush/builder.h 2015-08-26 13:50:52.000000000 -0400
+++ ceph-0.94.3-vring-bucket/src/crush/builder.h 2016-02-02 12:08:36.556528140 -0500
@@ -40,5 +40,10 @@ crush_make_straw_bucket(struct crush_map
int hash, int type, int size,
int *items,
int *weights);
+struct crush_bucket_vring *
+crush_make_vring_bucket(struct crush_map *map,
+ int hash, int type, int size,
+ int *items,
+ int *weights);
#endif
diff -Naupr ceph-0.94.3-stock/src/crush/crush.h ceph-0.94.3-vring-bucket/src/crush/crush.h
--- ceph-0.94.3-stock/src/crush/crush.h 2015-08-26 13:50:52.000000000 -0400
+++ ceph-0.94.3-vring-bucket/src/crush/crush.h 2016-04-07 16:34:05.406299924 -0400
@@ -109,6 +109,7 @@ enum {
CRUSH_BUCKET_TREE = 3,
CRUSH_BUCKET_STRAW = 4,
CRUSH_BUCKET_STRAW2 = 5,
+ CRUSH_BUCKET_VRING = 6
};
extern const char *crush_bucket_alg_name(int alg);
@@ -168,8 +169,19 @@ struct crush_bucket_straw2 {
struct crush_bucket h;
__u32 *item_weights; /* 16-bit fixed point */
};
+
+struct crush_bucket_vring_entry {
+ uint64_t virtual_id;
+ int item;
+ int array_idx;
+ int ring_size;
+};
-
+struct crush_bucket_vring {
+ struct crush_bucket h;
+ struct crush_bucket_vring_entry *ring;
+ int vnode_factor;
+};
/*
* CRUSH map includes all buckets, rules, etc.
@@ -229,6 +241,7 @@ extern void crush_destroy_bucket_list(st
extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
extern void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b);
+extern void crush_destroy_bucket_vring(struct crush_bucket_vring *b);
extern void crush_destroy_bucket(struct crush_bucket *b);
extern void crush_destroy_rule(struct crush_rule *r);
extern void crush_destroy(struct crush_map *map);
diff -Naupr ceph-0.94.3-stock/src/crush/mapper.c ceph-0.94.3-vring-bucket/src/crush/mapper.c
--- ceph-0.94.3-stock/src/crush/mapper.c 2015-08-26 13:50:52.000000000 -0400
+++ ceph-0.94.3-vring-bucket/src/crush/mapper.c 2016-02-02 11:54:48.523280060 -0500
@@ -37,6 +37,9 @@
#include "hash.h"
#include "crush_ln_table.h"
+static int bucket_vring_choose(struct crush_bucket_vring *bucket,
+ int x, int r);
+
/*
* Implement the core CRUSH mapping algorithm.
*/
@@ -369,6 +372,9 @@ static int crush_bucket_choose(struct cr
case CRUSH_BUCKET_STRAW2:
return bucket_straw2_choose((struct crush_bucket_straw2 *)in,
x, r);
+ case CRUSH_BUCKET_VRING:
+ return bucket_vring_choose((struct crush_bucket_vring *)in,
+ x, r);
default:
dprintk("unknown bucket %d alg %d\n", in->id, in->alg);
return in->items[0];
@@ -980,4 +986,55 @@ int crush_do_rule(const struct crush_map
return result_len;
}
+static int nearest_cmp(const void* key, const void *member)
+{
+ const uint64_t *obj = key;
+ const struct crush_bucket_vring_entry *entry = member;
+ const struct crush_bucket_vring_entry *next_entry = entry + 1;
+
+ if(*obj < entry->virtual_id)
+ return(-1);
+ if(*obj > entry->virtual_id)
+ {
+ /* are we on the last server already? */
+ if(entry->array_idx == entry->ring_size-1)
+ return(0);
+ /* is the oid also larger than the next server's id? */
+ if(next_entry->virtual_id < *obj)
+ return(1);
+ }
+
+ /* otherwise it is in our range */
+ return(0);
+}
+
+/* BEGIN vring bucket type implementation */
+
+static int bucket_vring_choose(struct crush_bucket_vring *bucket,
+ int x, int r)
+{
+ struct crush_bucket_vring_entry *entry;
+ uint64_t obj;
+ __u32 top;
+
+ /* generate an intermediate object id based on hash of incoming object id
+ * and replication factor
+ */
+ obj = crush_hash32_3(bucket->h.hash, x, r, 0);
+ top = crush_hash32_3(bucket->h.hash, x, r, 1);
+ obj += (((uint64_t)top)<<32);
+
+ /* binary search through ring to find server with greatest virtual node
+ * ID less than the OID
+ */
+ entry = bsearch(&obj, bucket->ring, bucket->h.size*bucket->vnode_factor,
+ sizeof(*bucket->ring), nearest_cmp);
+
+ /* if bsearch didn't find a match, then object belongs to last partition */
+ if(!entry)
+ entry = &bucket->ring[bucket->h.size * bucket->vnode_factor - 1];
+
+ return(entry->item);
+}
+/* END vring bucket type implementation */
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment