# blobsets/src/blobsets.nim

import std/asyncdispatch
import std/hashes, std/streams, std/strutils, std/bitops, std/unicode, std/endians
import cbor, siphash
import ./blobsets/priv/hex
import std/streams, std/strutils

import nimcrypto, nimcrypto/blake2

const
  digestLen* = 32
    ## Number of bytes in a chunk digest.
  cidSize* = digestLen
    ## In-memory size of a CID object; equal to `digestLen`.
  blobLeafSize* = 1 shl 14
    ## Number of bytes in a blob leaf.
  blobLeafSizeMask* = blobLeafSize - 1
    ## Mask selecting the low 14 bits, i.e. an offset within one leaf.
  blobHexLen* = digestLen * 2
    ## Length of an id in its hexadecimal encoding.
  blobVisualLen* = digestLen * 3
    ## Byte length of an id in its visual encoding.

  maxChunkSize* {.deprecated} = blobLeafSize
    ## Deprecated name for `blobLeafSize`.

type
  Blake2b256* = Blake2bContext[256]
    ## `Blake2bContext` instantiated with 256 bits; the hashing context used
    ## for every digest in this module.

  BlobId* = MDigest[Blake2b256.bits]
    ## Blob Identifier
  SetId* = MDigest[Blake2b256.bits]
    ## Set Identifier. Declared with the same underlying type as `BlobId`.

  Cid* {.deprecated} = BlobId
    ## Deprecated alias of `BlobId`.

func `$`*(bh: BlobId): string =
  ## Render a blob hash visually: every digest byte becomes the rune
  ## `0x2800 or byte`, written as UTF-8 into a `blobVisualLen` byte string.
  const firstRune = 0x2800
  result = newString(blobVisualLen)
  var offset = 0
  for octet in items(bh.data):
    # `doInc = true` makes fastToUTF8Copy advance `offset` itself.
    fastToUTF8Copy(Rune(firstRune or int(octet)), result, offset, true)

func parseStringId[T](s: string): T =
  ## Parse an id from its hexadecimal (`blobHexLen` characters) or visual
  ## (`blobVisualLen` bytes of UTF-8) encoding.
  ##
  ## Raises `ValueError` when `s` has neither length, or when the visual form
  ## contains anything other than runes in U+2800..U+28FF, which is the range
  ## produced by `$`.
  case s.len
  of blobHexLen:
    hex.decode s, result.data
  of blobVisualLen:
    # A rune U+28xx is always the three UTF-8 bytes E2, A0..A3, 80..BF.
    # The low two bits of the second byte and the low six bits of the third
    # byte together form the digest byte. Checking the bytes directly keeps
    # every read inside `s` and rejects foreign runes instead of truncating
    # them.
    var pos = 0
    for b in result.data.mitems:
      let
        lead = s[pos].uint8
        mid = s[pos+1].uint8
        tail = s[pos+2].uint8
      if lead != 0xE2 or (mid and 0xFC) != 0xA0 or (tail and 0xC0) != 0x80:
        raise newException(ValueError, "invalid blobset id encoding")
      b = ((mid and 0x03) shl 6) or (tail and 0x3F)
      inc pos, 3
  else:
    raise newException(ValueError, "invalid blobset id encoding")

func parseCborId[T](c: CborNode): T =
  ## Copy the byte string of a CBOR node into an id.
  ## When the byte count differs from the id's digest length nothing is
  ## copied and the zero-initialised id is returned.
  if c.bytes.len != result.data.len:
    return
  for i in 0..result.data.high:
    result.data[i] = c.bytes[i]

func toBlobId*(s: string): BlobId =
  ## Parse a hexadecimal or visual blob hash string to binary.
  result = parseStringId[BlobId](s)

func toBlobId(c: CborNode): BlobId =
  ## Parse the byte string of a CBOR node into a blob hash.
  result = parseCborId[BlobId](c)

func toSetId*(s: string): SetId =
  ## Parse a hexadecimal or visual set hash string to binary.
  result = parseStringId[SetId](s)

func toSetId(c: CborNode): SetId =
  ## Parse the byte string of a CBOR node into a set hash.
  result = parseCborId[SetId](c)

proc `==`*(x, y: BlobId): bool =
  ## True when both ids hold the same digest bytes.
  result = x.data == y.data

proc `==`*(cbor: CborNode; cid: BlobId): bool =
  ## Compare a CBOR node with a BlobId.
  ##
  ## True only when `cbor` is a byte string of exactly `digestLen` bytes that
  ## all match `cid`. Any other node kind or length compares unequal.
  # The length test keeps the loop from indexing past a short byte string and
  # stops a longer byte string from matching on its first `digestLen` bytes.
  if cbor.kind == cborBytes and cbor.bytes.len == digestLen:
    for i in 0..<digestLen:
      if cid.data[i] != cbor.bytes[i].uint8:
        return false
    result = true

proc hash*(cid: BlobId): Hash =
  ## Reduce a BlobId to a `Hash` for use in tables, by running SipHash over
  ## the digest bytes with a zero-initialised key.
  var nullKey: Key
  let digest = sipHash(cid.data, nullKey)
  cast[Hash](digest)

proc toCbor*(cid: BlobId): CborNode =
  ## Wrap the digest bytes of a BlobId in a CBOR byte-string node.
  result = newCborBytes(cid.data)

proc toBlobId*(cbor: CborNode): BlobId =
  ## Read a BlobId out of the byte string held by a CBOR node.
  ## The node must carry exactly `digestLen` bytes; this is checked with
  ## `assert`.
  assert(cbor.bytes.len == digestLen)
  for i, octet in result.data.mpairs:
    octet = cbor.bytes[i].uint8

# Renaming-list form of the `deprecated` pragma: marks `newCborBytes` as the
# old name and `toCbor` as its replacement.
# NOTE(review): `newCborBytes` is still called further down in this file;
# confirm this renaming is intended.
{.deprecated: [newCborBytes: toCbor].}

proc toHex*(id: BlobId|SetId): string =
  ## Return the id's digest bytes encoded in hexadecimal.
  result = hex.encode(id.data)

const
  # Built from its hexadecimal form with `toDigest`; being a `const`, the
  # conversion is evaluated at compile time.
  zeroChunk* = "8ddb61928ec76e4ee904cd79ed977ab6f5d9187f1102975060a6ba6ce10e5481".toDigest
    ## BlobId of zero chunk of maximum size.

proc take*(cid: var BlobId; buf: var string) =
  ## Overwrite `cid` with the raw digest held in `buf`.
  ## `buf` must be exactly `digestLen` bytes long; this is always checked.
  doAssert(buf.len == digestLen)
  let
    dst = addr cid.data[0]
    src = addr buf[0]
  copyMem(dst, src, digestLen)

proc blobHash*(buf: pointer; len: Natural): BlobId =
  ## Hash `len` bytes starting at `buf` with BLAKE2b and return the digest as
  ## a BlobId. `len` may not exceed `maxChunkSize` (checked with `assert`).
  assert(len <= maxChunkSize)
  var ctx: Blake2b256
  ctx.init()
  ctx.update(buf, len)
  result = ctx.finish()

proc blobHash*(data: string): BlobId =
  ## Hash a string with BLAKE2b and return the digest as a BlobId.
  ## `data` may not be longer than `maxChunkSize` (checked with `assert`).
  assert(data.len <= maxChunkSize)
  var ctx: Blake2b256
  ctx.init()
  ctx.update(data)
  result = ctx.finish()

proc verify*(cid: BlobId; data: string): bool =
  ## Hash `data` with BLAKE2b and report whether the digest equals `cid`.
  var ctx: Blake2b256
  ctx.init()
  ctx.update(data)
  result = ctx.finish() == cid

iterator simpleChunks*(s: Stream; size = maxChunkSize): string =
  ## Yield the stream's contents in pieces of at most `size` bytes until the
  ## stream reports its end. `size` may not exceed `maxChunkSize`.
  ## The same string is reused for every piece.
  doAssert(size <= maxChunkSize)
  var chunk = newString(size)
  while not atEnd(s):
    # Restore full capacity, then shrink to what was actually read.
    chunk.setLen(size)
    let got = s.readData(addr chunk[0], size)
    chunk.setLen(got)
    yield chunk

func isNonZero*(bh: BlobId): bool =
  ## True when at least one digest byte is non-zero.
  # All bytes are OR-ed together without an early exit.
  var acc: byte
  for octet in bh.data.items:
    {.unroll.}
    acc = acc or octet
  result = acc != 0

# Renaming-list form of the `deprecated` pragma: marks `isValid` as the old
# name and `isNonZero` as its replacement.
{.deprecated: [isValid: isNonZero].}

type
  Key* = uint64
    ## Trie key; `toKey` derives one from a name.

const
  # Number of bits in a `Key` (8 bytes shl 3 = 64).
  keyBits = sizeof(Key) shl 3
  # Key bits consumed per trie level: log2 of 64 is 6.
  keyChunkBits = fastLog2 keyBits
  # Mask selecting the lowest `keyChunkBits` bits of a key.
  keyChunkMask = not ((not 0.Key) shl (keyChunkBits))

func toKey*(s: string): Key =
  ## Derive a trie key from a name: SipHash over the bytes of `s` with a
  ## zero-initialised key, reinterpreted as a `Key`.
  var zeroKey: siphash.Key
  cast[Key](sipHash(toOpenArrayByte(s, s.low, s.high), zeroKey))

func toCbor(k: Key): CborNode =
  ## Encode a key as a CBOR byte string holding the key's raw bytes.
  # NOTE(review): the cast reinterprets the key's in-memory bytes, so the
  # byte order is that of the host. The original comment here called keys
  # endian independent; confirm which is intended.
  let raw = cast[array[sizeof(k), byte]](k)
  newCborBytes raw

const
  # CBOR tags
  # `toCbor` wraps hot and cold set nodes in `nodeTag` and leaves in
  # `leafTag`; `loadSet` dispatches on the same two values.
  nodeTag = 0
  leafTag = 1

type
  # Variant discriminator for a trie node.
  SetKind* = enum hotNode, coldNode, leafNode
  BlobSet* = ref BlobSetObj
  # One node of the blob-set trie.
  BlobSetObj = object
    case kind*: SetKind
    of hotNode:
      # In-memory interior node. Bit `i` of `bitmap` is set when slot `i` is
      # occupied; `table` holds only the occupied slots, ordered by slot.
      bitmap: uint64
      table: seq[BlobSet]
    of coldNode:
      # Node known only by the id of its stored form.
      setId*: SetId
    of leafNode:
      # Set member: the full key of its name, the blob id and the blob size.
      key: Key
      blob*: BlobId
      size: BiggestInt

func toCbor*(x: BlobSet): CborNode =
  ## Encode a trie node as tagged CBOR.
  ##
  ## * hot node: `nodeTag` around an array of the bitmap followed by every
  ##   child's encoding
  ## * cold node: `nodeTag` around the set id bytes
  ## * leaf: `leafTag` around an array of key, blob id bytes and size
  case x.kind
  of hotNode:
    let
      elems = newCborArray()
      bitmap = newCborInt(x.bitmap)
    # The bitmap must survive the round trip through the CBOR integer.
    assert(bitmap.getInt.uint64 == x.bitmap, $bitmap.getInt.uint64 & " != " & $x.bitmap)
    elems.add bitmap
    for child in x.table:
      elems.add child.toCbor
    result = newCborTag(nodeTag, elems)
  of coldNode:
    result = newCborTag(nodeTag, x.setId.data.newCborBytes)
  of leafNode:
    let fields = newCborArray()
    fields.add x.key
    fields.add x.blob.data
    fields.add x.size
    result = newCborTag(leafTag, fields)

func newBlobSet*(): BlobSet =
  ## Create an empty hot blob set whose table has capacity for two entries.
  result = BlobSet(kind: hotNode, table: newSeqOfCap[BlobSet](2))

func newBlobSet*(id: SetId): BlobSet =
  ## Create a cold blob set that refers to the stored set `id`.
  result = BlobSet(kind: coldNode, setId: id)

func isHot*(bs: BlobSet): bool =
  ## True when `bs` is a hot node.
  result = bs.kind == hotNode

func sparseIndex(x: Key): int =
  ## Slot number addressed by the lowest key chunk of `x`.
  result = (x and keyChunkMask).int

func compactIndex(t: BlobSet; x: Key): int =
  ## Position in `t.table` that belongs to the slot addressed by `x`: the
  ## number of occupied slots below that slot.
  let slot = x.sparseIndex
  if slot != 0:
    # TODO: bug in shr and shl, cannot shift all bits out
    # Shifting left by `keyBits - slot` keeps only the bits below `slot`.
    result = int(countSetBits(t.bitmap shl (keyBits - slot)))

func masked(t: BlobSet; x: Key): bool =
  ## True when the slot addressed by the lowest chunk of `x` is occupied.
  result = (t.bitmap and (1'u64 shl x.sparseIndex)) != 0

func isEmpty*(s: BlobSet): bool =
  ## Test if a set is empty, i.e. its bitmap has no bit set.
  result = s.bitmap == 0'u64

func nodeCount*(bs: BlobSet): int =
  ## Number of hot nodes in the set, counting `bs` itself.
  ## Cold children are rejected with an assertion.
  result = 1
  for n in bs.table:
    assert(n.kind != coldNode, "cannot count cold nodes")
    case n.kind
    of hotNode:
      inc(result, nodeCount(n))
    else:
      discard

func leafCount*(bs: BlobSet): int =
  ## Number of leaves reachable from `bs`.
  ## Cold children are rejected with an assertion.
  for n in bs.table:
    assert(n.kind != coldNode, "cannot count leaves of cold nodes")
    case n.kind
    of leafNode:
      inc result
    else:
      inc(result, leafCount(n))

func search*(trie: BlobSet; name: string): BlobId =
  ## Return the blob id stored under `name`.
  ##
  ## Raises `KeyError` when the name is not in the set.
  let key = name.toKey
  var
    n = trie
    k = key
  # Descend one key chunk per level while the addressed slot is occupied.
  # The loop does not stop when the remaining key bits are all zero:
  # `insert` stores such keys in slot 0 like any other chunk, so they have to
  # be followed here as well. The walk ends at a leaf or at an empty slot.
  while n.masked(k):
    n = n.table[n.compactIndex(k)]
    if n.kind == leafNode:
      if n.key == key:
        return n.blob
      break
    k = k shr keyChunkBits
  raise newException(KeyError, "key not in blob set")

func apply(bs: BlobSet; cb: proc (leaf: BlobSet)) =
  ## Call `cb` on every leaf below `bs`, recursing through hot nodes.
  ## A nil entry or a cold node raises an assertion error.
  for node in bs.table:
    if node.isNil:
      raiseAssert(bs.table.repr)
    case node.kind
    of leafNode:
      cb(node)
    of hotNode:
      apply(node, cb)
    of coldNode:
      raiseAssert("cannot apply to node type " & $node.kind)

func apply*(trie: BlobSet; name: string; f: proc (id: BlobId; size: BiggestInt)) =
  ## Apply a procedure to a named blob, if it is present.
  ## `f` receives the blob id and size; it is not called when `name` is
  ## absent.
  let key = name.toKey
  var
    n = trie
    k = key
  # Descend one key chunk per level while the addressed slot is occupied.
  # The loop does not stop when the remaining key bits are all zero:
  # `insert` stores such keys in slot 0 like any other chunk, so they have to
  # be followed here as well. The walk ends at a leaf or at an empty slot.
  while n.masked(k):
    n = n.table[n.compactIndex(k)]
    if n.kind == leafNode:
      if n.key == key:
        f(n.blob, n.size)
      break
    k = k shr keyChunkBits

func contains*(bs: BlobSet; name: string): bool =
  ## True when a blob is stored under `name`.
  var hit = false
  # The callback only runs when the name is found.
  proc mark(id: BlobId; size: BiggestInt) =
    hit = true
  apply(bs, name, mark)
  hit

func insert(trie, l: BlobSet; depth: int): BlobSet =
  ## Return a copy of hot node `trie` that contains leaf `l`; `depth` is the
  ## level of `trie` and selects the key chunk to use.
  ## This procedure is recursive to a depth of keyBits/keyChunkBits.
  # TODO: not functional?
  doAssert(depth < (keyBits div keyChunkBits), "key space exhausted during insert")
  result = BlobSet(kind: hotNode, bitmap: trie.bitmap, table: trie.table)
  let chunk = l.key shr (depth * keyChunkBits)
  if not result.masked(chunk):
    # Free slot: mark it and place the leaf at its compact position.
    result.bitmap = result.bitmap or (1'u64 shl chunk.sparseIndex)
    result.table.insert(l, result.compactIndex(chunk))
    return
  let
    below = depth + 1
    slot = result.compactIndex(chunk)
    occupant = result.table[slot]
  case occupant.kind
  of hotNode:
    result.table[slot] = insert(occupant, l, below)
  of coldNode:
    raiseAssert("cannot insert into cold node")
  of leafNode:
    if occupant.key == l.key:
      # Same key: the new leaf replaces the old one.
      result.table[slot] = l
    else:
      # Different keys share this slot: push both one level down.
      var subtrie = newBlobSet()
      subtrie = subtrie.insert(occupant, below)
      subtrie = subtrie.insert(l, below)
      result.table[slot] = subtrie

func insert*(trie, node: BlobSet): BlobSet =
  ## Insert set node `node` into `trie`, starting at the root level.
  result = insert(trie, node, 0)

func insert*(t: BlobSet; name: string; blob: BlobId; size: BiggestInt): BlobSet =
  ## Insert a blob hash into a trie under the key derived from `name`.
  # TODO: this is not functional!
  result = insert(t, BlobSet(kind: leafNode, key: name.toKey, blob: blob, size: size))

func remove(trie: BlobSet; key: Key; depth: int): BlobSet =
  ## Return hot node `trie` without the leaf stored under `key`.
  ##
  ## `key` is always the complete key; `depth` is the level of `trie` and
  ## selects which chunk of the key addresses a slot here.
  ##
  ## The result is `trie` itself when the key is absent, `nil` when removing
  ## the leaf leaves this node without entries, and otherwise a new hot node.
  ## `trie` is never modified.
  result = trie
  let chunk = key shr (depth * keyChunkBits)
  if not trie.masked(chunk):
    return
  let
    i = trie.compactIndex(chunk)
    child = trie.table[i]
  # Node to put in slot `i`; nil means the slot has to be dropped.
  var replacement: BlobSet
  case child.kind
  of hotNode:
    # Recurse with the full key; the callee shifts by its own depth.
    replacement = remove(child, key, depth + 1)
    if replacement == child:
      return # key is not stored below this child
  of coldNode:
    raiseAssert("cannot remove from cold node")
  of leafNode:
    if child.key != key:
      return # slot is taken by a different key
  if not replacement.isNil:
    result = BlobSet(kind: hotNode, bitmap: trie.bitmap, table: trie.table)
    result.table[i] = replacement
  elif trie.table.len == 1:
    # The only entry goes away; let the caller drop this node.
    result = nil
  else:
    result = BlobSet(kind: hotNode, bitmap: trie.bitmap, table: trie.table)
    result.table.delete(i)
    result.bitmap = result.bitmap and not (1'u64 shl chunk.sparseIndex)

func remove*(trie: BlobSet; name: string): BlobSet =
  ## Remove a blob from a trie.
  ## An empty trie is returned as is; a trie that becomes empty is returned
  ## as a fresh empty set.
  if trie.isEmpty:
    return trie
  result = remove(trie, name.toKey, 0)
  if result.isNil:
    result = newBlobSet()

func leafCount*(size: Natural): int =
  ## Number of `blobLeafSize` leaves needed for `size` bytes, rounded up.
  result = (size+blobLeafSize-1) div blobLeafSize

func compressTree*(leaves: var seq[BlobId]) =
  ## Reduce `leaves` in place by hashing adjacent pairs of digests, level by
  ## level, until at most one digest is left. A trailing digest without a
  ## partner is hashed on its own. A sequence with zero or one element is
  ## left unchanged.
  var
    ctx: Blake2b256
    nodeOffset = 0
    nodeDepth = 0
  while leaves.len > 1:
    # Each pass builds one tree level; offsets restart at zero per level.
    nodeOffset = 0
    inc nodeDepth
    # `pos` reads the current level, `next` writes the level above it into
    # the front of the same sequence.
    var pos, next: int
    while pos < leaves.len:
      ctx.init do (params: var Blake2bParams):
        params.fanout = 2
        params.depth = 255
        params.leafLength = blobLeafSize
        params.nodeOffset = nodeOffset
        params.nodeDepth = nodeDepth
      inc nodeOffset
      ctx.update(leaves[pos].data)
      inc pos
      if pos < leaves.len:
        ctx.update(leaves[pos].data)
        inc pos
      leaves[next] = ctx.finish()
      inc next
    # Drop the entries of the lower level that were not overwritten.
    leaves.setLen(next)
  # TODO: BLAKE2 tree finalization flags

type
  BlobKind* = enum
    dataBlob, ## rendered as "data"
    metaBlob  ## rendered as "meta"

proc `$`*(k: BlobKind): string =
  ## Short lowercase label for a blob kind.
  const labels: array[BlobKind, string] = ["data", "meta"]
  labels[k]

type
  BlobStream* = ref BlobStreamObj
  # Readable blob stream. Behaviour comes from the `*Impl` hooks below; the
  # public wrappers (`close`, `size`, `pos`, `pos=`, `read`) assert that the
  # hook is set and forward to it.
  BlobStreamObj* = object of RootObj
    closeImpl*: proc (s: BlobStream) {.nimcall, gcsafe.}
    sizeImpl*: proc (s: BlobStream): BiggestInt {.nimcall, gcsafe.}
    setPosImpl*: proc (s: BlobStream; pos: BiggestInt) {.nimcall, gcsafe.}
    getPosImpl*: proc (s: BlobStream): BiggestInt {.nimcall, gcsafe.}
    readImpl*: proc (s: BlobStream; buffer: pointer; bufLen: int): Future[int] {.nimcall, gcsafe.}
  IngestStream* = ref IngestStreamObj
  # Writable stream for ingesting a blob; `finishImpl` yields the id and size
  # of what was ingested.
  IngestStreamObj* = object of RootObj
    cancelImpl*: proc (s: IngestStream) {.nimcall, gcsafe.}
    finishImpl*: proc (s: IngestStream): Future[tuple[id: BlobId, size: BiggestInt]] {.nimcall, gcsafe.}
    ingestImpl*: proc (s: IngestStream; buf: pointer; size: int): Future[void] {.nimcall, gcsafe.}

proc close*(s: BlobStream) =
  ## Close the stream through its `closeImpl` hook, which must be set.
  assert(not s.closeImpl.isNil)
  let impl = s.closeImpl
  impl(s)

proc size*(s: BlobStream): BiggestInt =
  ## Size reported by the stream's `sizeImpl` hook, which must be set.
  assert(not s.sizeImpl.isNil)
  let impl = s.sizeImpl
  result = impl(s)

proc `pos=`*(s: BlobStream; pos: BiggestInt) =
  ## Set the stream position through the `setPosImpl` hook, which must be
  ## set.
  assert(not s.setPosImpl.isNil)
  let impl = s.setPosImpl
  impl(s, pos)

proc pos*(s: BlobStream): BiggestInt =
  ## Position reported by the stream's `getPosImpl` hook, which must be set.
  assert(not s.getPosImpl.isNil)
  let impl = s.getPosImpl
  result = impl(s)

proc read*(s: BlobStream; buf: pointer; len: Natural): Future[int] =
  ## Read up to `len` bytes into `buf` through the `readImpl` hook, which
  ## must be set. The future yields whatever count the hook reports.
  assert(not s.readImpl.isNil)
  let impl = s.readImpl
  result = impl(s, buf, len)

proc cancle*(s: IngestStream): tuple[id: BlobId, size: BiggestInt] =
  ## Cancel and close ingest stream through its `cancelImpl` hook, which must
  ## be set. The hook returns nothing, so the returned tuple is always the
  ## zero-initialised default.
  assert(not s.cancelImpl.isNil)
  let impl = s.cancelImpl
  impl(s)

proc finish*(s: IngestStream): Future[tuple[id: BlobId, size: BiggestInt]] =
  ## Finish the ingest stream through its `finishImpl` hook, which must be
  ## set. The future yields the id and size of the ingested blob.
  assert(not s.finishImpl.isNil)
  let impl = s.finishImpl
  result = impl(s)

proc ingest*(s: IngestStream; buf: pointer; size: Natural): Future[void] =
  ## Feed `size` bytes starting at `buf` to the stream through its
  ## `ingestImpl` hook, which must be set.
  assert(not s.ingestImpl.isNil)
  let impl = s.ingestImpl
  result = impl(s, buf, size)

proc ingest*(s: IngestStream; buf: string): Future[void] =
  ## Feed the bytes of `buf` to the stream through its `ingestImpl` hook,
  ## which must be set.
  ##
  ## An empty string is accepted: the returned future is completed right away
  ## and the hook is not called.
  assert(not s.ingestImpl.isNil)
  if buf.len == 0:
    # An empty string has no `buf[0]` to take the address of.
    result = newFuture[void]()
    complete result
  else:
    result = s.ingestImpl(s, buf[0].unsafeAddr, buf.len)

type
  BlobStore* = ref BlobStoreObj
  # Store interface. `close` calls `closeImpl` only when it is set;
  # `openBlobStream` and `openIngestStream` assert that their hook is set.
  BlobStoreObj* = object of RootObj
    closeImpl*: proc (s: BlobStore) {.nimcall, gcsafe.}
    openBlobStreamImpl*: proc (s: BlobStore; id: BlobId; size: BiggestInt; kind: BlobKind): BlobStream {.nimcall, gcsafe.}
    openIngestStreamImpl*: proc (s: BlobStore; size: BiggestInt; kind: BlobKind): IngestStream {.nimcall, gcsafe.}

#
# Null Store implementation
#

type
  NullIngestStream = ref NullIngestStreamObj
  # Ingest stream of the null store: it hashes what it is given and keeps
  # only the digests.
  NullIngestStreamObj = object of IngestStreamObj
    # Hash context of the leaf currently being filled.
    ctx: Blake2b256
    # Digests of the leaves completed so far.
    leaves: seq[BlobId]
    # `pos` is the number of bytes ingested; `nodeOffset` is the offset
    # handed to the hash parameters of the next leaf.
    pos, nodeOffset: BiggestInt

proc nullBlobClose(s: BlobStream) =
  ## Closing a null blob stream does nothing.
  discard

proc setPosNull(s: BlobStream; pos: BiggestInt) =
  ## Setting the position of a null blob stream does nothing.
  discard
proc getPosNull(s: BlobStream): BiggestInt =
  ## The position of a null blob stream is always zero.
  result = 0

proc nullBlobRead(s: BlobStream; buffer: pointer; len: Natural): Future[int] =
  ## Reading from a null blob stream yields zero bytes; the returned future
  ## is already completed.
  let done = newFuture[int]()
  done.complete(0)
  done

proc nullOpenBlobStream(s: BlobStore; id: BlobId; size: BiggestInt; kind: BlobKind): BlobStream =
  ## Open a blob stream of the null store. All arguments are ignored.
  ## The close, position and read hooks are set; `sizeImpl` is left unset.
  result = BlobStream(
    readImpl: nullBlobRead,
    getPosImpl: getPosNull,
    setPosImpl: setPosNull,
    closeImpl: nullBlobClose)

proc nullFinish(s: IngestStream): Future[tuple[id: BlobId, size: BiggestInt]] =
  ## Finish a null ingest stream: close the current leaf hash, compress the
  ## leaf digests and complete the future with the first remaining digest and
  ## the number of bytes ingested.
  let ns = NullIngestStream(s)
  ns.leaves.add finish(ns.ctx)
  compressTree(ns.leaves)
  result = newFuture[tuple[id: BlobId, size: BiggestInt]]()
  complete result, (id: ns.leaves[0], size: ns.pos)

proc nullIngest(s: IngestStream; buf: pointer; len: Natural): Future[void] =
  ## Hash `len` bytes starting at `buf` into the stream's leaf digests; the
  ## data itself is not kept. The returned future is already completed.
  var
    s = NullIngestStream(s)
    off = 0
    # Unchecked view of the caller's buffer. `len` may be larger than one
    # leaf, so a fixed `array[blobLeafSize, byte]` view would fail its index
    # check as soon as `off` reaches `blobLeafSize`.
    buf = cast[ptr UncheckedArray[byte]](buf)
  while off < len:
    var n = min(blobLeafSize, len-off)
    # Offset of the stream position within the current leaf.
    let leafOff = int(s.pos and blobLeafSizeMask)
    if leafOff == 0:
      # On a leaf boundary: store the finished leaf, if there is one, and
      # start hashing the next leaf.
      if s.pos > 0:
        s.leaves.add finish(s.ctx)
      s.ctx.init do (params: var Blake2bParams):
        params.fanout = 2
        params.depth = 255
        params.leafLength = blobLeafSize
        params.nodeOffset = s.nodeOffset
        inc s.nodeOffset
    else:
      # Inside a leaf: take no more than what still fits in it.
      n = min(n, blobLeafSize-leafOff)
    s.ctx.update(buf[off].addr, n)
    off.inc n
    s.pos.inc n
  result = newFuture[void]()
  complete result

proc nullOpenIngestStream(s: BlobStore; size: BiggestInt; kind: BlobKind): IngestStream =
  ## Open an ingest stream of the null store. All arguments are ignored.
  ## The finish and ingest hooks are set; `cancelImpl` is left unset.
  result = NullIngestStream(
    ingestImpl: nullIngest,
    finishImpl: nullFinish,
    leaves: newSeq[BlobId]())

proc newNullStore*(): BlobStore =
  ## Create a store backed by the null stream implementations.
  ## `closeImpl` is left unset.
  result = BlobStore(
    openIngestStreamImpl: nullOpenIngestStream,
    openBlobStreamImpl: nullOpenBlobStream)

proc close*(s: BlobStore) =
  ## Close active store resources. Does nothing when the store has no
  ## `closeImpl` hook.
  if s.closeImpl.isNil:
    return
  s.closeImpl(s)

proc openBlobStream*(s: BlobStore; id: BlobId; size = 0.BiggestInt; kind = dataBlob): BlobStream =
  ## Return a new `BlobStream` for reading blob `id`, obtained from the
  ## store's `openBlobStreamImpl` hook, which must be set.
  assert(not s.openBlobStreamImpl.isNil)
  let impl = s.openBlobStreamImpl
  result = impl(s, id, size, kind)

proc openIngestStream*(s: BlobStore; size = 0.BiggestInt; kind = dataBlob): IngestStream =
  ## Return a new `IngestStream` for ingesting a blob, obtained from the
  ## store's `openIngestStreamImpl` hook, which must be set.
  assert(not s.openIngestStreamImpl.isNil)
  let impl = s.openIngestStreamImpl
  result = impl(s, size, kind)

iterator dumpBlob*(store: BlobStore; id: BlobId): string =
  ## Yield the contents of data blob `id` in pieces of at most `blobLeafSize`
  ## bytes, stopping at the first read that returns nothing.
  ## Each read blocks on `waitFor`. The same string is reused for every
  ## piece.
  var
    stream = store.openBlobStream(id, kind=dataBlob)
    chunk = newString(blobLeafSize)
  defer:
    close stream
  while true:
    # Restore full capacity, then shrink to what was actually read.
    chunk.setLen(blobLeafSize)
    let got = waitFor stream.read(addr chunk[0], chunk.len)
    if got == 0:
      break
    chunk.setLen(got)
    yield chunk

proc loadSet(store: BlobStore; id: SetId; depth: int): Future[BlobSet] {.async.} =
  ## Load the stored set `id` as a hot node. Children tagged as nodes are
  ## loaded recursively; children tagged as leaves become leaf nodes.
  ## `depth` is the recursion level, starting at 0.
  ##
  ## Raises `ValueError` when the bitmap and the number of children disagree
  ## or when a child carries an unknown tag.
  # Recursion guard: trips once shifting an all-ones key right by `depth`
  # leaves zero.
  if (not Key(0)) shr depth == 0:
    raiseAssert("loadSet trie is too deep")
  var
    stream = store.openBlobStream(id, kind=metaBlob)
    buf = newString(blobLeafSize)
  defer:
    close stream
  # A single read of at most `blobLeafSize` bytes is parsed as the whole set.
  let n = await stream.read(buf[0].addr, buf.len)
  assert(n != 0, "read zero of set " & $id)
  buf.setLen(n)
  let
    tagPair = parseCbor buf
    c = tagPair.val
    # Element 0 is the bitmap; every following element is one child.
    bitmap = c.seq[0].getInt.uint64
  if bitmap.countSetBits != c.seq.len-1:
    let bits = bitmap.countSetBits
    raise newException(ValueError, "invalid set CBOR, bitmap has " & $bits & " bits and sequence len is " & $c.seq.len)
  result = BlobSet(
    kind: hotNode,
    bitmap: bitmap,
    table: newSeqOfCap[BlobSet](c.seq.len-1))
  for i in 1..c.seq.high:
    let node = c[i].val
    case c[i].tag.int
    of nodeTag:
      # The child is referenced by set id; load it one level deeper.
      let child = await loadSet(store, node.toSetId, depth+1)
      result.table.add child
    of leafTag:
      # Leaf fields are in the order `toCbor` writes them: key, blob, size.
      let
        leaf = BlobSet(
          kind: leafNode,
          key: getNum[Key] node[0],
          blob: parseCborId[BlobId] node[1],
          size: getInt node[2])
      result.table.add leaf
    else:
      raise newException(ValueError, "invalid set CBOR")

proc load*(store: BlobStore; id: SetId): BlobSet =
  ## Load the stored set `id`, blocking until `loadSet` completes.
  result = waitFor loadSet(store, id, 0)

proc commit*(store: BlobStore; bs: BlobSet): BlobSet =
  ## Write a hot set to the store and return the cold node that refers to it.
  ## Hot children are committed first; a cold `bs` is returned unchanged.
  ## Blocks on `waitFor` while ingesting.
  if bs.kind == coldNode:
    return bs
  # Work on a copy so that `bs` keeps its hot children.
  let snapshot = BlobSet(kind: hotNode, bitmap: bs.bitmap, table: bs.table)
  for child in snapshot.table.mitems:
    if child.isHot:
      child = store.commit child
  let stream = store.openIngestStream(kind=metaBlob)
  var encoded = encode snapshot.toCbor
  waitFor stream.ingest(encoded)
  let written = waitFor finish(stream)
  result = BlobSet(kind: coldNode, setId: written.id)

proc apply*(store: BlobStore; bs: BlobSet; name: string; f: proc (id: BlobId; size: BiggestInt)) =
  ## Apply `f` to the blob stored under `name` in hot set `bs`, if present.
  ## `store` is not used yet.
  # TODO: lazy-load set
  doAssert(bs.kind == hotNode)
  bs.apply(name, f)

proc insert*(store: BlobStore; bs: BlobSet; name: string; blob: BlobId; size: BiggestInt): BlobSet =
  ## Return `bs` with `blob` stored under `name`. `store` is not used yet.
  # TODO: lazy-load set
  result = bs.insert(name, blob, size)

proc remove*(store: BlobStore; bs: BlobSet; name: string): BlobSet =
  ## Return the result of removing `name` from `bs`. `store` is not used yet.
  # TODO: lazy-load set
  result = bs.remove(name)

proc union*(store: BlobStore; sets: varargs[BlobSet]): BlobSet =
  ## Return the union of `sets`: a fresh set into which every leaf of every
  ## argument is inserted, in argument order.
  # TODO: lazy-load set
  var merged = newBlobSet()
  proc absorb(leaf: BlobSet) =
    merged = insert(merged, leaf)
  for bs in sets:
    assert(not bs.isnil)
    apply(bs, absorb)
  merged

import random

proc randomApply*(store: BlobStore; trie: BlobSet; seed: int64;
                  f: proc(id: BlobId; size: BiggestInt)) =
  ## Apply to random leaf if the set is not empty.
  ## The walk is driven by a generator seeded with `seed`.
  var
    rng = initRand(seed)
    # NOTE(review): `retry` is not used in the code visible here.
    retry = 0
    trie = trie
    # Random table index; `max(1, ...)` keeps the range valid for an empty
    # bitmap.
    i = rng.rand(max(1, countSetBits(trie.bitmap))-1)
  while trie.bitmap != 0:
    let next = trie.table[i]
    case next.kind
    of leafNode:
      f(next.blob, next.size)
      break
    of coldNode:
      # Replace the cold entry with its loaded form, then look at the same
      # slot again. This writes into the table of the node being walked.
      trie.table[i] = store.load(next.setId)
    of hotNode:
      # Descend and pick a new random entry of the child.
      trie = next
      i = rng.rand(countSetBits(trie.bitmap)-1)