Open garyfeng opened 1 year ago
Download http://ann-benchmarks.com/nytimes-256-angular.hdf5: it has 256 dimensions and 300k vectors, roughly in the ballpark of what we are looking for. Note that you have to paste the above link into a new browser page, or use wget
to download it; clicking the link doesn't work.
reading HDF5 vector files:
def get_dataset(which):
    """Open the HDF5 file for dataset *which*, fetching or building it first.

    The target filename comes from ``get_dataset_fn(which)``. A download
    from ann-benchmarks.com is attempted; if that fails and *which* is a key
    of ``DATASETS``, the matching generator is called to create the file
    locally instead.

    Returns:
        A ``(hdf5_file, dimension)`` tuple. ``hdf5_file`` is an ``h5py.File``
        opened read-only and is returned still open. ``dimension`` is a
        plain Python ``int``.
    """
    hdf5_fn = get_dataset_fn(which)
    # Built outside the ``try`` so the name is always bound in the handler.
    url = 'http://ann-benchmarks.com/%s.hdf5' % which
    try:
        download(url, hdf5_fn)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
        # SystemExit are not mistaken for a failed download.
        print("Cannot download %s" % url)
        if which in DATASETS:
            print("Creating dataset locally")
            DATASETS[which](hdf5_fn)
    hdf5_f = h5py.File(hdf5_fn, 'r')

    # here for backward compatibility, to ensure old datasets can still be used with newer versions
    # cast to integer because the json parser (later on) cannot interpret numpy integers
    if 'dimension' in hdf5_f.attrs:
        dimension = int(hdf5_f.attrs['dimension'])
    else:
        # Older files carry no attribute: fall back to the length of the
        # first training vector.
        dimension = len(hdf5_f['train'][0])
    return hdf5_f, dimension
For random samples, use the random_float() function:
See also how they handled mixing real and fake data:
# Registry of dataset generators, keyed by dataset name. The lambda entries
# take the output filename (``out_fn``) and forward it, together with fixed
# parameters, to the underlying generator; the bare-name entries are
# presumably called the same way (generator definitions are not shown here).
DATASETS = {
    'deep-image-96-angular': deep_image,
    'fashion-mnist-784-euclidean': fashion_mnist,
    'gist-960-euclidean': gist,
    'glove-25-angular': lambda out_fn: glove(out_fn, 25),
    'glove-50-angular': lambda out_fn: glove(out_fn, 50),
    'glove-100-angular': lambda out_fn: glove(out_fn, 100),
    'glove-200-angular': lambda out_fn: glove(out_fn, 200),
    'mnist-784-euclidean': mnist,
    # Synthetic (randomly generated) datasets.
    'random-xs-20-euclidean': lambda out_fn: random_float(
        out_fn, 20, 10000, 100, 'euclidean'),
    'random-s-100-euclidean': lambda out_fn: random_float(
        out_fn, 100, 100000, 1000, 'euclidean'),
    'random-xs-20-angular': lambda out_fn: random_float(
        out_fn, 20, 10000, 100, 'angular'),
    'random-s-100-angular': lambda out_fn: random_float(
        out_fn, 100, 100000, 1000, 'angular'),
    'random-xs-16-hamming': lambda out_fn: random_bitstring(
        out_fn, 16, 10000, 100),
    'random-s-128-hamming': lambda out_fn: random_bitstring(
        out_fn, 128, 50000, 1000),
    'random-l-256-hamming': lambda out_fn: random_bitstring(
        out_fn, 256, 100000, 1000),
    'random-s-jaccard': lambda out_fn: random_jaccard(
        out_fn, n=10000, size=20, universe=40),
    'random-l-jaccard': lambda out_fn: random_jaccard(
        out_fn, n=100000, size=70, universe=100),
    # Remaining real-data sets.
    'sift-128-euclidean': sift,
    'nytimes-256-angular': lambda out_fn: nytimes(out_fn, 256),
    'nytimes-16-angular': lambda out_fn: nytimes(out_fn, 16),
    'word2bits-800-hamming': lambda out_fn: word2bits(
        out_fn, '400K', 'w2b_bitlevel1_size800_vocab400K'),
    'lastfm-64-dot': lambda out_fn: lastfm(out_fn, 64),
    'sift-256-hamming': lambda out_fn: sift_hamming(
        out_fn, 'sift.hamming.256'),
    'kosarak-jaccard': lambda out_fn: kosarak(out_fn),
    'movielens1m-jaccard': movielens1m,
    'movielens10m-jaccard': movielens10m,
    'movielens20m-jaccard': movielens20m,
}
For another example of creating random vectors, see https://github.com/milvus-io/bootcamp/blob/master/solutions/image/face_recognition_system/quick_deploy/server/src/example.py to use with Milvus.
Methods: