# blobsets/src/blobsets.nim

import std/hashes, std/streams, std/strutils, std/bitops, std/unicode, std/endians
import cbor, siphash
import ./blobsets/priv/hex
import nimcrypto, nimcrypto/blake2

const
  digestLen* = 32
    ## Length of a chunk digest.
  cidSize* = digestLen
    ## Size of a CID object in memory.
  blobLeafSize* = 1 shl 14
    ## Size of blob leaves.
  blobLeafSizeMask* = not(not(0) shl 14)
  blobHexLen* = 32 * 2
    ## Length of a hexadecimal-encoded identifier.
  blobVisualLen* = 32 * 3
    ## Length of a visually-encoded identifier.
  maxChunkSize* {.deprecated.} = blobLeafSize

type
  Blake2b256* = Blake2bContext[256]

  BlobId* = MDigest[Blake2b256.bits]
    ## Blob Identifier
  SetId* = MDigest[Blake2b256.bits]
    ## Set Identifier

  Cid* {.deprecated.} = BlobId

func `$`*(bh: BlobId): string =
  ## Convert a blob hash to a visual representation.
  const baseRune = 0x2800
  result = newString(blobVisualLen)
  var pos = 0
  for b in bh.data.items:
    let r = Rune(baseRune or b.int)
    fastToUTF8Copy(r, result, pos, true)
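
# Each digest byte maps to a Braille rune (U+2800 plus the byte value),
# so identifiers round-trip through the visual encoding. A minimal
# sketch (`dagHash` is defined further below):
#
#   let id = dagHash("")
#   doAssert toBlobId($id) == id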
func parseStringId[T](s: string): T =
  case s.len
  of blobHexLen:
    hex.decode s, result.data
  of blobVisualLen:
    var
      pos: int
      r: Rune
    for b in result.data.mitems:
      fastRuneAt(s, pos, r, true)
      b = r.byte
  else:
    raise newException(ValueError, "invalid blobset id encoding")

func parseCborId[T](c: CborNode): T =
  ## Parse a CBOR node to binary.
  if c.bytes.len == result.data.len:
    for i in 0..result.data.high:
      result.data[i] = c.bytes[i]

func toBlobId*(s: string): BlobId =
  ## Parse a visual or hexadecimal blob hash to binary.
  parseStringId[BlobId] s

func toBlobId(c: CborNode): BlobId =
  ## Parse a CBOR blob hash to binary.
  parseCborId[BlobId] c

func toSetId*(s: string): SetId =
  ## Parse a visual or hexadecimal set hash to binary.
  parseStringId[SetId] s

func toSetId(c: CborNode): SetId =
  ## Parse a CBOR set hash to binary.
  parseCborId[SetId] c

proc `==`*(x, y: BlobId): bool = x.data == y.data
  ## Compare two BlobIds.

proc `==`*(cbor: CborNode; cid: BlobId): bool =
  ## Compare a CBOR node with a BlobId.
  if cbor.kind == cborBytes:
    for i in 0..<digestLen:
      if cid.data[i] != cbor.bytes[i].uint8:
        return false
    result = true

proc hash*(cid: BlobId): Hash =
  ## Reduce a BlobId into an integer for use in tables.
  var zeroKey: siphash.Key
  result = cast[Hash](sipHash(cid.data, zeroKey))

proc toCbor*(cid: BlobId): CborNode = newCborBytes cid.data
  ## Generate a CBOR representation of a BlobId.

proc toBlobId*(cbor: CborNode): BlobId =
  ## Parse a BlobId from a CBOR node.
  assert(cbor.bytes.len == digestLen)
  for i in 0..<digestLen:
    result.data[i] = cbor.bytes[i].uint8

{.deprecated: [newCborBytes: toCbor].}

proc toHex*(id: BlobId|SetId): string = hex.encode(id.data)
  ## Return BlobId encoded in hexadecimal.

const
  zeroChunk* = "8ddb61928ec76e4ee904cd79ed977ab6f5d9187f1102975060a6ba6ce10e5481".toDigest
    ## BlobId of a zero chunk of maximum size.

proc take*(cid: var BlobId; buf: var string) =
  ## Take a raw digest from a string buffer.
  doAssert(buf.len == digestLen)
  copyMem(cid.data[0].addr, buf[0].addr, digestLen)

proc dagHash*(buf: pointer; len: Natural): BlobId =
  ## Generate a BlobId for a buffer of data using the BLAKE2b hash algorithm.
  assert(len <= maxChunkSize)
  var b: Blake2b256
  init(b)
  update(b, buf, len)
  b.finish()

proc dagHash*(data: string): BlobId =
  ## Generate a BlobId for a string of data using the BLAKE2b hash algorithm.
  assert(data.len <= maxChunkSize)
  var b: Blake2b256
  init(b)
  update(b, data)
  b.finish()

proc verify*(cid: BlobId; data: string): bool =
  ## Verify that a string of data corresponds to a BlobId.
  var b: Blake2b256
  init(b)
  update(b, data)
  finish(b) == cid
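
# Usage sketch for hashing and verification:
#
#   let id = dagHash("hello blob")
#   doAssert id.verify("hello blob")
#   doAssert not id.verify("tampered")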
iterator simpleChunks*(s: Stream; size = maxChunkSize): string =
  ## Iterator that breaks a stream into simple chunks.
  doAssert(size <= maxChunkSize)
  var tmp = newString(size)
  while not s.atEnd:
    tmp.setLen(size)
    tmp.setLen(s.readData(tmp[0].addr, size))
    yield tmp
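
# Sketch: chunk an in-memory stream (`newStringStream` is from std/streams):
#
#   var str = newStringStream("...")
#   for chunk in simpleChunks(str):
#     doAssert chunk.len <= blobLeafSize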
func isNonZero*(bh: BlobId): bool =
  ## Test if a blob hash is not zeroed.
  var r: byte
  for b in bh.data.items:
    {.unroll.}
    r = r or b
  r != 0

{.deprecated: [isValid: isNonZero].}
type
  Key = int64

const
  keyBits = sizeof(Key) shl 3
  keyChunkBits = fastLog2 keyBits
  keyChunkMask = not ((not 0.Key) shl (keyChunkBits))

func toKey*(s: string): Key =
  var key: siphash.Key
  let b = sipHash(toOpenArrayByte(s, s.low, s.high), key)
  cast[Key](b)

func toCbor(k: Key): CborNode =
  ## Keys are endian independent.
  newCborBytes cast[array[sizeof(k), byte]](k)

const
  # CBOR tags
  nodeTag = 0
  leafTag = 1

type
  SetKind* = enum hotNode, coldNode, leafNode

  BlobSet* = ref BlobSetObj
  BlobSetObj = object
    case kind*: SetKind
    of hotNode:
      bitmap: Key
      table: seq[BlobSet]
    of coldNode:
      setId*: SetId
    of leafNode:
      key: Key
      blob: BlobId
      size: BiggestInt

func newBlobSet*(): BlobSet =
  BlobSet(kind: hotNode, table: newSeqOfCap[BlobSet](2))

func sparseIndex(x: Key): int = int(x and keyChunkMask)

func compactIndex(t: BlobSet; x: Key): int =
  if (x and keyChunkMask) != 0:
    # TODO: bug in shr and shl, cannot shift all bits out
    result = (int)countSetBits(t.bitmap shl (keyBits - x.sparseIndex))

func masked(t: BlobSet; x: Key): bool =
  ((t.bitmap shr x.sparseIndex) and 1) != 0

func isEmpty*(s: BlobSet): bool = s.bitmap == Key(0)
  ## Test if a set is empty.
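
# Sketch of the popcount indexing used above: `bitmap` marks which sparse
# child positions are occupied, and a child's slot in `table` is the count
# of occupied positions below it. E.g. for bitmap 0b101001 the child at
# sparse index 5 sits at compact index 2:
#
#   doAssert countSetBits(0b101001 and ((1 shl 5) - 1)) == 2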

func nodeCount*(bs: BlobSet): int =
  ## Count of internal nodes in set.
  result = 1
  for n in bs.table:
    assert(n.kind != coldNode, "cannot count cold nodes")
    if n.kind == hotNode:
      result.inc n.nodeCount

func leafCount*(bs: BlobSet): int =
  ## Count of leaves in set.
  for n in bs.table:
    assert(n.kind != coldNode, "cannot count leaves of cold nodes")
    if n.kind == leafNode:
      result.inc 1
    else:
      result.inc n.leafCount

func search*(t: BlobSet; name: string): BlobId =
  ## Search the set for a blob by name, raising `KeyError` if absent.
  var
    t = t
    key = name.toKey
  while true:
    assert(key != 0, "keyspace exhausted during search")
    if t.masked(key):
      t = t.table[t.compactIndex(key)]
      if t.kind == leafNode:
        result = t.blob
        break
      key = key shr keyChunkBits
    else:
      raise newException(KeyError, "blob set does not contain key")

func apply(bs: BlobSet; cb: proc (leaf: BlobSet)) =
  ## Apply a callback to each set element.
  for node in bs.table:
    if node.isNil:
      raiseAssert(bs.table.repr)
    case node.kind
    of hotNode:
      apply(node, cb)
    of leafNode:
      cb(node)
    else:
      raiseAssert("cannot apply to node type " & $node.kind)

func apply*(t: BlobSet; name: string; f: proc (id: BlobId; size: BiggestInt)) =
  ## Apply a procedure to a named blob, if it is present.
  var
    t = t
    key = name.toKey
  while true:
    assert(key != 0, "keyspace exhausted during search")
    if t.masked(key):
      t = t.table[t.compactIndex(key)]
      if t.kind == leafNode:
        f(t.blob, t.size)
        break
      key = key shr keyChunkBits
    else:
      break

func contains*(bs: BlobSet; name: string): bool =
  ## Test if the set contains a named blob.
  var found = false
  apply(bs, name) do (id: BlobId; size: BiggestInt):
    found = true
  result = found

func insert(trie, l: BlobSet; depth: int): BlobSet =
  ## This procedure is recursive to a depth of keyBits/keyChunkBits.
  # TODO: not functional?
  doAssert(depth < (keyBits div keyChunkBits), "key space exhausted during insert")
  result = BlobSet(kind: hotNode, bitmap: trie.bitmap, table: trie.table)
  let key = l.key shr (depth * keyChunkBits)
  if result.masked(key):
    let
      depth = depth + 1
      i = result.compactIndex(key)
    case result.table[i].kind
    of hotNode:
      result.table[i] = insert(result.table[i], l, depth)
    of coldNode:
      raiseAssert("cannot insert into cold node")
    of leafNode:
      if result.table[i].key == l.key:
        raise newException(KeyError, "key collision in blob set")
      var subtrie = newBlobSet()
      subtrie = subtrie.insert(result.table[i], depth)
      subtrie = subtrie.insert(l, depth)
      result.table[i] = subtrie
  else:
    result.bitmap = result.bitmap or (Key(1) shl key.sparseIndex)
    result.table.insert(l, result.compactIndex(key))

func insert*(trie, node: BlobSet): BlobSet = insert(trie, node, 0)
  ## Insert set node `node` into `trie`.

func insert*(t: BlobSet; name: string; blob: BlobId; size: BiggestInt): BlobSet =
  ## Insert a blob hash into a trie.
  # TODO: this is not functional!
  let leaf = BlobSet(kind: leafNode, key: name.toKey, blob: blob, size: size)
  insert(t, leaf)

func remove(trie: BlobSet; key: Key; depth: int): BlobSet =
  ## Remove a key from a trie node, returning a replacement node,
  ## or nil if the node is left empty.
  result = trie
  let chunk = key shr (depth * keyChunkBits)
  if trie.masked(chunk):
    let i = trie.compactIndex(chunk)
    case trie.table[i].kind
    of hotNode:
      # Pass the unshifted key down so each level masks its own chunk.
      let newTrie = remove(trie.table[i], key, depth+1)
      if newTrie != trie.table[i]:
        if newTrie.isNil:
          if trie.table.len == 1:
            result = nil
          else:
            # Rebuild this node from the remaining children.
            # TODO: rebuilding assumes the remaining children are leaves.
            result = newBlobSet()
            for j in trie.table.low..trie.table.high:
              if j == i: continue
              result = insert(result, trie.table[j], depth)
        else:
          result = BlobSet(kind: hotNode, bitmap: trie.bitmap, table: trie.table)
          result.table[i] = newTrie
    of coldNode:
      raiseAssert("cannot remove from cold node")
    of leafNode:
      if trie.table[i].key == key:
        if trie.table.len == 1:
          result = nil
        else:
          # Rebuild this node without the removed leaf.
          result = newBlobSet()
          for j in trie.table.low..trie.table.high:
            if j == i: continue
            result = insert(result, trie.table[j], depth)

func remove*(trie: BlobSet; name: string): BlobSet =
  ## Remove a blob from a trie.
  if trie.isEmpty:
    result = trie
  else:
    let key = name.toKey
    result = remove(trie, key, 0)
    if result.isNil:
      result = newBlobSet()
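
# Usage sketch for building and querying a set in memory:
#
#   var s = newBlobSet()
#   let blob = dagHash("file contents")
#   s = s.insert("README", blob, 13)
#   doAssert s.contains("README")
#   doAssert s.search("README") == blob
#   s = s.remove("README")
#   doAssert not s.contains("README")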

func toCbor*(x: BlobSet): CborNode =
  case x.kind
  of hotNode:
    let array = newCborArray()
    array.add x.bitmap
    for y in x.table:
      array.add y.toCbor
    newCborTag(nodeTag, array)
  of coldNode:
    newCborTag(nodeTag, x.setId.data.newCborBytes)
  of leafNode:
    let array = newCborArray()
    array.add x.key
    array.add x.blob.data
    array.add x.size
    newCborTag(leafTag, array)

func leafCount*(size: Natural): int = (size+blobLeafSize-1) div blobLeafSize
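# For example, leafCount(0) == 0, leafCount(blobLeafSize) == 1,
# and leafCount(blobLeafSize+1) == 2.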

func compressTree*(leaves: var seq[BlobId]) =
  var
    ctx: Blake2b256
    nodeOffset = 0
    nodeDepth = 0
  while leaves.len > 1:
    nodeOffset = 0
    inc nodeDepth
    var pos, next: int
    while pos < leaves.len:
      ctx.init do (params: var Blake2bParams):
        params.fanout = 2
        params.depth = 255
        params.leafLength = blobLeafSize
        params.nodeOffset = nodeOffset
        params.nodeDepth = nodeDepth
      inc nodeOffset
      ctx.update(leaves[pos].data)
      inc pos
      if pos < leaves.len:
        ctx.update(leaves[pos].data)
        inc pos
      leaves[next] = ctx.finish()
      inc next
    leaves.setLen(next)
  # TODO: BLAKE2 tree finalization flags
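
# Sketch: compressTree folds a level of digests pairwise until a single
# root remains:
#
#   var leaves = @[dagHash("a"), dagHash("b"), dagHash("c")]
#   compressTree(leaves)
#   doAssert leaves.len == 1   # leaves[0] is the tree root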

type
  BlobKind* = enum
    dataBlob, metaBlob

  BlobStream* = ref BlobStreamObj
  BlobStreamObj* = object of RootObj
    closeImpl*: proc (s: BlobStream) {.nimcall, gcsafe.}
    readImpl*: proc (s: BlobStream; buffer: pointer; bufLen: int): int {.nimcall, gcsafe.}

  IngestStream* = ref IngestStreamObj
  IngestStreamObj* = object of RootObj
    finishImpl*: proc (s: IngestStream): tuple[id: BlobId, size: BiggestInt] {.nimcall, gcsafe.}
    ingestImpl*: proc (s: IngestStream; buf: pointer; size: int) {.nimcall, gcsafe.}

proc close*(s: BlobStream) =
  ## Close a blob stream.
  assert(not s.closeImpl.isNil)
  s.closeImpl(s)

proc read*(s: BlobStream; buf: pointer; len: Natural): int =
  ## Read from a blob stream into a buffer.
  assert(not s.readImpl.isNil)
  result = s.readImpl(s, buf, len)

proc finish*(s: IngestStream): tuple[id: BlobId, size: BiggestInt] =
  ## Finish an ingest stream, returning the blob identifier and size.
  assert(not s.finishImpl.isNil)
  s.finishImpl(s)

proc ingest*(s: IngestStream; buf: pointer; size: Natural) =
  ## Ingest a buffer into a stream.
  assert(not s.ingestImpl.isNil)
  s.ingestImpl(s, buf, size)

proc ingest*(s: IngestStream; buf: var string) =
  ## Ingest a string into a stream.
  assert(not s.ingestImpl.isNil)
  s.ingestImpl(s, buf[0].addr, buf.len)

type
  BlobStore* = ref BlobStoreObj
  BlobStoreObj* = object of RootObj
    closeImpl*: proc (s: BlobStore) {.nimcall, gcsafe.}
    openBlobStreamImpl*: proc (s: BlobStore; id: BlobId; size: BiggestInt; kind: BlobKind): BlobStream {.nimcall, gcsafe.}
    openIngestStreamImpl*: proc (s: BlobStore; size: BiggestInt; kind: BlobKind): IngestStream {.nimcall, gcsafe.}

#
# Null Store implementation
#

type
  NullIngestStream = ref NullIngestStreamObj
  NullIngestStreamObj = object of IngestStreamObj
    ctx: Blake2b256
    leaves: seq[BlobId]
    pos, nodeOffset: BiggestInt

proc nullBlobClose(s: BlobStream) = discard

proc nullBlobRead(s: BlobStream; buffer: pointer; len: Natural): int = 0

proc nullOpenBlobStream(s: BlobStore; id: BlobId; size: BiggestInt; kind: BlobKind): BlobStream =
  BlobStream(closeImpl: nullBlobClose, readImpl: nullBlobRead)

proc nullFinish(s: IngestStream): tuple[id: BlobId, size: BiggestInt] =
  var s = NullIngestStream(s)
  s.leaves.add finish(s.ctx)
  compressTree(s.leaves)
  result.id = s.leaves[0]
  result.size = s.pos

proc nullIngest(s: IngestStream; buf: pointer; len: Natural) =
  var
    s = NullIngestStream(s)
    off = 0
    buf = cast[ptr array[blobLeafSize, byte]](buf)
  while off < len:
    var n = min(blobLeafSize, len-off)
    let leafOff = int(s.pos and blobLeafSizeMask)
    if leafOff == 0:
      if s.pos > 0:
        s.leaves.add finish(s.ctx)
      s.ctx.init do (params: var Blake2bParams):
        params.fanout = 2
        params.depth = 255
        params.leafLength = blobLeafSize
        params.nodeOffset = s.nodeOffset
      inc s.nodeOffset
    else:
      n = min(n, blobLeafSize-leafOff)
    s.ctx.update(buf[off].addr, n)
    off.inc n
    s.pos.inc n

proc nullOpenIngestStream(s: BlobStore; size: BiggestInt; kind: BlobKind): IngestStream =
  NullIngestStream(
    finishImpl: nullFinish, ingestImpl: nullIngest, leaves: newSeq[BlobId]())

proc newNullStore*(): BlobStore =
  ## Return a store that discards blob data while still computing identifiers.
  BlobStore(
    openBlobStreamImpl: nullOpenBlobStream,
    openIngestStreamImpl: nullOpenIngestStream)
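
# Usage sketch: the null store discards blob content but still computes
# the BLAKE2b tree hash, so it can derive a BlobId cheaply:
#
#   let store = newNullStore()
#   let ingest = store.openIngestStream()
#   var data = "some blob data"
#   ingest.ingest(data)
#   let (id, size) = finish ingest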

proc close*(s: BlobStore) =
  ## Close active store resources.
  if not s.closeImpl.isNil: s.closeImpl(s)

proc openBlobStream*(s: BlobStore; id: BlobId; size = 0.BiggestInt; kind = dataBlob): BlobStream =
  ## Return a new `BlobStream` for reading a blob.
  assert(not s.openBlobStreamImpl.isNil)
  s.openBlobStreamImpl(s, id, size, kind)

proc openIngestStream*(s: BlobStore; size = 0.BiggestInt; kind = dataBlob): IngestStream =
  ## Return a new `IngestStream` for ingesting a blob.
  assert(not s.openIngestStreamImpl.isNil)
  s.openIngestStreamImpl(s, size, kind)

iterator dumpBlob*(store: BlobStore; id: BlobId): string =
  ## Iterate over the contents of a blob in leaf-sized chunks.
  var
    stream = store.openBlobStream(id, kind=dataBlob)
    buf = newString(blobLeafSize)
  defer:
    close stream
  while true:
    buf.setLen(blobLeafSize)
    let n = stream.read(buf[0].addr, buf.len)
    if n == 0:
      break
    buf.setLen(n)
    yield buf
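
# Sketch: stream a blob back out (`store` and `id` are hypothetical):
#
#   for chunk in store.dumpBlob(id):
#     stdout.write(chunk)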

proc loadSet(store: BlobStore; id: SetId; depth: int): BlobSet =
  if Key.high shr depth == 0:
    raiseAssert("loadSet trie is too deep")
  var
    stream = store.openBlobStream(id, kind=metaBlob)
    buf = newString(blobLeafSize)
  defer:
    close stream
  let n = stream.read(buf[0].addr, buf.len)
  buf.setLen(n)
  let
    c = buf.parseCbor.val
    bitmap = c.seq[0].getInt
  if bitmap.countSetBits != c.seq.len-1:
    let bits = bitmap.countSetBits
    raise newException(ValueError,
      "invalid set CBOR, bitmap is " & $bits & " and sequence len is " & $c.seq.len)
  result = BlobSet(
    kind: hotNode,
    bitmap: bitmap,
    table: newSeqOfCap[BlobSet](c.seq.len-1))
  for i in 1..c.seq.high:
    let node = c[i].val
    case c[i].tag.int
    of nodeTag:
      result.table.add loadSet(store, node.toSetId, depth+1)
    of leafTag:
      let leaf = BlobSet(
        kind: leafNode,
        key: getInt node[0],
        blob: parseCborId[BlobId] node[1],
        size: getInt node[2])
      result.table.add leaf
    else:
      raise newException(ValueError, "invalid set CBOR")

proc loadSet*(store: BlobStore; id: SetId): BlobSet =
  ## Load a set from a store by identifier.
  loadSet store, id, 0

proc commit*(store: BlobStore; bs: BlobSet): BlobSet =
  ## Commit a hot set to a store, returning a cold node holding its `SetId`.
  assert(bs.kind == hotNode)
  for e in bs.table.mitems:
    case e.kind
    of coldNode, leafNode: discard
    of hotNode:
      e = store.commit e
  let stream = store.openIngestStream(kind=metaBlob)
  var buf = encode bs.toCbor
  stream.ingest(buf)
  let (id, _) = finish stream
  result = BlobSet(kind: coldNode, setId: id)

proc apply*(store: BlobStore; bs: BlobSet; name: string; f: proc (id: BlobId; size: BiggestInt)) =
  # TODO: lazy-load set
  bs.apply(name, f)

proc insert*(store: BlobStore; bs: BlobSet; name: string; blob: BlobId; size: BiggestInt): BlobSet =
  # TODO: lazy-load set
  insert(bs, name, blob, size)

proc remove*(store: BlobStore; bs: BlobSet; name: string): BlobSet =
  # TODO: lazy-load set
  remove(bs, name)

proc union*(store: BlobStore; sets: varargs[BlobSet]): BlobSet =
  ## Return the union of `sets`.
  # TODO: lazy-load set
  var fresh = newBlobSet()
  proc freshInsert(leaf: BlobSet) =
    fresh = insert(fresh, leaf)
  for bs in sets:
    assert(not bs.isNil)
    bs.apply(freshInsert)
  result = fresh
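
# End-to-end sketch (assumes `store` is backed by a persistent
# implementation rather than the null store):
#
#   var bs = newBlobSet()
#   bs = store.insert(bs, "a.txt", dagHash("A"), 1)
#   let cold = store.commit(bs)        # persist the set, get a cold node
#   let reloaded = store.loadSet(cold.setId)
#   doAssert reloaded.contains("a.txt")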