# blobsets/src/blobsets.nim

import std/hashes, std/streams, std/strutils, std/bitops, std/unicode, std/endians
import base58/bitcoin, cbor, siphash
import ./blobsets/priv/hex
import nimcrypto, nimcrypto/blake2
const
  digestLen* = 32
    ## Length of a chunk digest.
  cidSize* = digestLen
    ## Size of a CID object in memory.
  blobLeafSize* = 1 shl 14
    ## Size of blob leaves.
  blobLeafSizeMask* = not(not(0) shl 14)
  visualLen = 32 * 3
  maxChunkSize* {.deprecated.} = blobLeafSize

type
  Blake2b256* = Blake2bContext[256]

  BlobId* = MDigest[Blake2b256.bits]
    ## Blob identifier.

  SetId* = MDigest[Blake2b256.bits]
    ## Set identifier.

  Cid* {.deprecated.} = BlobId
func `$`*(bh: BlobId): string =
  ## Convert a blob hash to a visual representation
  ## (one braille rune per digest byte).
  const baseRune = 0x2800
  result = newString(visualLen)
  var pos = 0
  for b in bh.data.items:
    let r = Rune(baseRune or b.int)
    fastToUTF8Copy(r, result, pos, true)
func toBlobId*(s: string): BlobId =
  ## Parse a visual blob hash back to binary.
  ## A string of the wrong length yields the zero digest.
  if s.len == visualLen:
    var
      pos: int
      r: Rune
    for b in result.data.mitems:
      fastRuneAt(s, pos, r, true)
      b = byte(r.int and 0xff)
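
# Round-trip sketch of the visual encoding. The hex constant is
# arbitrary example data, not a digest with any special meaning.
when isMainModule:
  let exampleId: BlobId = "8ddb61928ec76e4ee904cd79ed977ab6f5d9187f1102975060a6ba6ce10e5481".toDigest
  let exampleVisual = $exampleId   # 32 braille runes, one per byte
  doAssert exampleVisual.toBlobId.data == exampleId.data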
proc `==`*(x, y: BlobId): bool =
  ## Compare two BlobIds.
  x.data == y.data

proc `==`*(cbor: CborNode; cid: BlobId): bool =
  ## Compare a CBOR node with a BlobId.
  if cbor.kind == cborBytes and cbor.bytes.len == digestLen:
    for i in 0..<digestLen:
      if cid.data[i] != cbor.bytes[i].uint8:
        return false
    result = true
proc hash*(cid: BlobId): Hash =
  ## Reduce a BlobId into an integer for use in tables.
  var zeroKey: siphash.Key
  cast[Hash](sipHash(cid.data, zeroKey))

proc toCbor*(cid: BlobId): CborNode =
  ## Generate a CBOR representation of a BlobId.
  newCborBytes cid.data

proc toBlobId*(cbor: CborNode): BlobId =
  ## Parse a BlobId from a CBOR byte string.
  assert(cbor.bytes.len == digestLen)
  for i in 0..<digestLen:
    result.data[i] = cbor.bytes[i].uint8

{.deprecated: [newCborBytes: toCbor].}

proc toHex*(cid: BlobId): string =
  ## Return the BlobId encoded in hexadecimal.
  hex.encode(cid.data)
proc writeUvarint*(s: Stream; n: SomeInteger) =
  ## Write an unsigned varint (LEB128), as used by IPFS multiformats.
  var n = n
  while true:
    let c = n and 0x7f
    n = n shr 7
    if n == 0:
      s.write(char(c))
      break
    else:
      s.write(char(c or 0x80))

proc readUvarint*(s: Stream): BiggestInt =
  ## Read an unsigned varint (LEB128), as used by IPFS multiformats.
  var shift: int
  while shift < (9 * 8):
    let c = BiggestInt(s.readChar.ord)
    result = result or ((c and 0x7f) shl shift)
    if (c and 0x80) == 0:
      break
    shift.inc 7
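
# Round-trip sketch for the varint codec over an in-memory stream.
when isMainModule:
  let vs = newStringStream()
  vs.writeUvarint 300       # encodes as 0xAC 0x02
  vs.setPosition 0
  doAssert vs.readUvarint == 300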
proc toIpfs*(cid: BlobId): string =
  ## Return the BlobId as an IPFS CIDv1 string in base58btc multibase.
  const
    multiRaw = 0x55
    multiBlake2b_256 = 0xb220
  let s = newStringStream()
  s.writeUvarint 1                 # CID version
  s.writeUvarint multiRaw          # multicodec: raw binary
  s.writeUvarint multiBlake2b_256  # multihash: BLAKE2b-256
  s.writeUvarint digestLen
  for e in cid.data:
    s.write e
  s.setPosition 0
  result = 'z' & bitcoin.encode(s.readAll)
  close s
const
  zeroChunk* = "8ddb61928ec76e4ee904cd79ed977ab6f5d9187f1102975060a6ba6ce10e5481".toDigest
    ## BlobId of a zero chunk of maximum size.

proc take*(cid: var BlobId; buf: var string) =
  ## Take a raw digest from a string buffer.
  doAssert(buf.len == digestLen)
  copyMem(cid.data[0].addr, buf[0].addr, digestLen)
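
# Multiformat sketch: render the zero-chunk digest as a CIDv1 string.
when isMainModule:
  echo zeroChunk.toIpfs   # prints a 'z'-prefixed base58btc CID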
proc dagHash*(buf: pointer; len: Natural): BlobId =
  ## Generate a BlobId for a buffer of data using the BLAKE2b-256 hash algorithm.
  assert(len <= blobLeafSize)
  var b: Blake2b256
  init(b)
  update(b, buf, len)
  b.finish()

proc dagHash*(data: string): BlobId =
  ## Generate a BlobId for a string of data using the BLAKE2b-256 hash algorithm.
  assert(data.len <= blobLeafSize)
  var b: Blake2b256
  init(b)
  update(b, data)
  b.finish()

proc verify*(cid: BlobId; data: string): bool =
  ## Verify that a string of data corresponds to a BlobId.
  var b: Blake2b256
  init(b)
  update(b, data)
  finish(b) == cid
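
# Hash-and-verify sketch: a leaf-sized string checked against its digest.
when isMainModule:
  let
    msg = "hello blobset"
    msgId = dagHash(msg)
  doAssert msgId.verify(msg)
  doAssert not msgId.verify(msg & "!")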
iterator simpleChunks*(s: Stream; size = blobLeafSize): string =
  ## Iterate over a stream, breaking it into simple chunks.
  doAssert(size <= blobLeafSize)
  var tmp = newString(size)
  while not s.atEnd:
    tmp.setLen(size)
    tmp.setLen(s.readData(tmp[0].addr, size))
    yield tmp
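
# Chunking sketch: input is split at `size` boundaries and the final
# chunk carries whatever remains.
when isMainModule:
  let cs = newStringStream("some blob data")
  for chunk in cs.simpleChunks(size = 4):
    echo chunk.len, " byte(s): ", chunk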
func isNonZero*(bh: BlobId): bool =
  ## Test if a blob hash is not zeroed.
  var r: byte
  for b in bh.data.items:
    {.unroll.}
    r = r or b
  r != 0

{.deprecated: [isValid: isNonZero].}
type
  Key = int64

const
  keyBits = sizeof(Key) shl 3      # 64 bits per key
  keyChunkBits = fastLog2 keyBits  # 6 bits consumed per trie level
  keyChunkMask = not ((not 0.Key) shl keyChunkBits)

func toKey(s: string): Key =
  var key: siphash.Key
  let b = sipHash(toOpenArrayByte(s, s.low, s.high), key)
  cast[Key](b)

func toCbor(k: Key): CborNode =
  ## Serialize a key to CBOR bytes (native byte order).
  newCborBytes cast[array[sizeof(k), byte]](k)
type
  setKind* = enum hotNode, coldNode, leafNode

  BlobSet* = ref BlobSetObj
  BlobSetObj = object
    case kind*: setKind
    of hotNode:
      bitmap: Key
      table*: seq[BlobSet]
    of coldNode:
      setId*: SetId
    of leafNode:
      key: Key
      blob: BlobId
      size: BiggestInt

func newBlobSet*(): BlobSet =
  ## Create a new, empty hot set.
  BlobSet(kind: hotNode, table: newSeqOfCap[BlobSet](2))
func sparseIndex(x: Key): int = int(x and keyChunkMask)

func compactIndex(t: BlobSet; x: Key): int =
  if (x and keyChunkMask) != 0:
    # TODO: bug in shr and shl, cannot shift all bits out
    result = int(countSetBits(t.bitmap shl (keyBits - x.sparseIndex)))

func masked(t: BlobSet; x: Key): bool =
  ((t.bitmap shr x.sparseIndex) and 1) != 0
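
# The bitmap above is the usual HAMT compression: bit i of `bitmap`
# records whether a child exists at sparse index i, and the popcount
# of the bits below i gives that child's slot in the dense `table`.
# A standalone sketch of the rule with a hypothetical bitmap:
when isMainModule:
  let bm = 0b1011'i64   # children at sparse indices 0, 1 and 3
  # two bits are set below index 3, so that child occupies table[2]
  doAssert countSetBits(bm and ((1'i64 shl 3) - 1)) == 2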
func nodeCount*(bs: BlobSet): int =
  ## Count the internal nodes of a set.
  result = 1
  for n in bs.table:
    assert(n.kind != coldNode, "cannot count cold nodes")
    if n.kind == hotNode:
      result.inc n.nodeCount

func leafCount*(bs: BlobSet): int =
  ## Count the leaves of a set.
  for n in bs.table:
    assert(n.kind != coldNode, "cannot count leaves of cold nodes")
    if n.kind == leafNode:
      result.inc 1
    else:
      result.inc n.leafCount
func search*(t: BlobSet; name: string): BlobId =
  ## Search the set for the blob named `name`,
  ## raising `KeyError` if it is absent.
  var
    t = t
    key = name.toKey
  while true:
    assert(key != 0, "keyspace exhausted during search")
    if t.masked(key):
      t = t.table[t.compactIndex(key)]
      if t.kind == leafNode:
        result = t.blob
        break
      key = key shr keyChunkBits
    else:
      raise newException(KeyError, "blob set does not contain key")
func insert(t, l: BlobSet; depth: int) =
  ## This procedure is recursive to a depth of keyBits/keyChunkBits.
  doAssert(depth < (keyBits div keyChunkBits), "key space exhausted during insert")
  let key = l.key shr (depth * keyChunkBits)
  if t.masked(key):
    let
      depth = depth + 1
      i = t.compactIndex(key)
    case t.table[i].kind
    of hotNode:
      t.table[i].insert(l, depth)
    of coldNode:
      raiseAssert("cannot insert into cold node")
    of leafNode:
      if t.table[i].key == l.key:
        raise newException(KeyError, "key collision in blob set")
      # Split the leaf: push the existing leaf and the new one
      # into a fresh subtrie one level deeper.
      let subtrie = newBlobSet()
      subtrie.insert(t.table[i], depth)
      subtrie.insert(l, depth)
      t.table[i] = subtrie
  else:
    t.bitmap = t.bitmap or (Key(1) shl key.sparseIndex)
    t.table.insert(l, t.compactIndex(key))
func insert*(t: BlobSet; name: string; blob: BlobId; size: BiggestInt) =
  ## Insert a blob hash into a trie.
  let leaf = BlobSet(kind: leafNode, key: name.toKey, blob: blob, size: size)
  insert(t, leaf, 0)

func isEmpty*(s: BlobSet): bool =
  ## Test if a set is empty.
  s.bitmap == Key(0)
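
# Usage sketch for the set: insert a named blob, then look it up.
when isMainModule:
  let
    demoSet = newBlobSet()
    demoBlob = dagHash("example blob")
  demoSet.insert("example", demoBlob, 12)
  doAssert demoSet.search("example") == demoBlob
  doAssert demoSet.leafCount == 1
  doAssert not demoSet.isEmpty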
func toCbor*(x: BlobSet): CborNode =
  const
    nodeTag = 0
    leafTag = 1
  let array = newCborArray()
  case x.kind
  of hotNode:
    var
      map = x.bitmap
      buf = newCborBytes(sizeof(Key))
    when sizeof(Key) != 8:
      {.error: "unknown key conversion".}
    bigEndian64(buf.bytes[0].addr, map.addr)
    array.add buf
    for y in x.table:
      array.add y.toCbor
    newCborTag(nodeTag, array)
  of coldNode:
    array.add x.setId.data
    newCborTag(nodeTag, array)
  of leafNode:
    array.add x.key.toCbor
    array.add x.blob.data
    array.add x.size
    newCborTag(leafTag, array)
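
# Serialization sketch: encode a small set to CBOR. This assumes the
# cbor module renders a CborNode to diagnostic notation via `$`.
when isMainModule:
  let cborSet = newBlobSet()
  cborSet.insert("readme", dagHash("readme contents"), 15)
  echo cborSet.toCbor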