Source code for dag.codecs.dag_cbor

"""DAG-CBOR codec - deterministic CBOR with IPLD CID links.

Multicodec code: ``0x71``

DAG-CBOR is CBOR (RFC 8949) with additional constraints:

1. **Deterministic encoding**: map keys sorted by byte length then
   lexicographically (RFC 7049 §3.9 canonical ordering), smallest
   possible integer representations, no indefinite-length items.
2. **CID links**: CIDs are encoded using CBOR tag 42 wrapping the
   CID bytes prefixed with a ``0x00`` identity multibase byte.
3. **No additional CBOR tags** are allowed (only tag 42).

Reference implementations:
- https://github.com/ipld/js-dag-cbor
- https://ipld.io/specs/codecs/dag-cbor/spec/
"""

from __future__ import annotations

from typing import Any

import cbor2
from cid import from_bytes as cid_from_bytes
from cid import make_cid

from ..codec import BlockCodec, register_codec
from ..ipld_model import CID, IPLDNode, is_cid
from ..multicodec_codes import DAG_CBOR_CODE, DAG_CBOR_NAME

_CID_CBOR_TAG = 42

_MULTIBASE_IDENTITY = b"\x00"


class DagCborCodec(BlockCodec):
    """Block codec for DAG-CBOR (multicodec ``0x71``).

    Serialises IPLD data-model values as deterministic CBOR in which
    every CID link is carried by CBOR tag 42.
    """

    @property
    def name(self) -> str:
        """Return the codec name (``DAG_CBOR_NAME``)."""
        return DAG_CBOR_NAME

    @property
    def code(self) -> int:
        """Return the codec's multicodec code (``DAG_CBOR_CODE``)."""
        return DAG_CBOR_CODE

    def encode(self, node: IPLDNode) -> bytes:
        """Serialise an IPLD value to DAG-CBOR bytes.

        CIDs are first rewritten to ``Tag(42, 0x00 || cid_bytes)`` by
        ``_prepare_for_cbor``; the result is then dumped with cbor2's
        canonical mode so that map keys come out in canonical CBOR order.
        """
        return cbor2.dumps(_prepare_for_cbor(node), canonical=True)

    def decode(self, data: bytes) -> IPLDNode:
        """Parse DAG-CBOR bytes back into an IPLD value.

        CBOR tag 42 items are converted into CID objects by ``_tag_hook``
        while cbor2 parses; the decoded tree is then passed through
        ``_restore_from_cbor``.
        """
        return _restore_from_cbor(cbor2.loads(data, tag_hook=_tag_hook))
def _prepare_for_cbor(node: Any) -> Any:
    """Recursively convert an IPLD value into something cbor2 can dump.

    - CIDs become ``CBORTag(42, b'\\x00' + cid_bytes)``.
    - Dicts, lists and tuples are rebuilt with their values converted
      (a tuple comes back as a list; both encode as a CBOR array).
    - ``bytearray`` is normalised to ``bytes``.
    - Every other value is returned unchanged.
    """
    if is_cid(node):
        return cbor2.CBORTag(_CID_CBOR_TAG, _MULTIBASE_IDENTITY + node.buffer)
    if isinstance(node, dict):
        return {key: _prepare_for_cbor(value) for key, value in node.items()}
    # Tuples are walked as well; otherwise a CID nested inside a tuple
    # would reach cbor2 unconverted.
    if isinstance(node, (list, tuple)):
        return [_prepare_for_cbor(item) for item in node]
    if isinstance(node, (bytes, bytearray)):
        return bytes(node)
    return node


def _tag_hook(decoder: Any, tag: cbor2.CBORTag) -> Any:
    """Convert a decoded CBOR tag into its IPLD value.

    Only tag 42 (CID link) is permitted in DAG-CBOR. ``decoder`` is part
    of the cbor2 hook signature and is not used.

    Raises ``ValueError`` for any other tag, for a tag 42 whose payload
    is not a byte string, and for payloads that are not valid CID bytes.
    """
    if tag.tag != _CID_CBOR_TAG:
        raise ValueError(f"Unsupported CBOR tag {tag.tag} in DAG-CBOR (only tag 42 is allowed)")

    payload = tag.value
    if not isinstance(payload, bytes):
        # Fail with a clear message instead of letting a str/int/list
        # payload trip over len()/indexing further down.
        raise ValueError(
            f"CBOR tag 42 must wrap a byte string, got {type(payload).__name__}"
        )

    # The leading 0x00 identity-multibase byte is stripped when present.
    # NOTE(review): payloads without the prefix are still accepted here,
    # as before; the DAG-CBOR spec appears to require the prefix, so
    # consider rejecting them once callers are confirmed not to rely on it.
    if payload[:1] == _MULTIBASE_IDENTITY:
        payload = payload[1:]
    return _decode_cid_bytes(payload)


# A CIDv0 is a bare sha2-256 multihash: hash code 0x12, digest length
# 0x20, followed by the 32 digest bytes (34 bytes in total).
_CIDV0_MULTIHASH_PREFIX = b"\x12\x20"
_CIDV0_MULTIHASH_LENGTH = 34


def _decode_cid_bytes(raw_bytes: bytes) -> CID:
    """Decode raw CID bytes into a ``CIDv0`` or ``CIDv1`` object.

    CIDv1 bytes start with the version byte ``0x01`` and are handed to
    py-cid's ``from_bytes``. CIDv0 bytes carry no version prefix: they
    are just the raw sha2-256 multihash, so the CID is constructed
    directly with codec ``dag-pb``.

    .. note:: Version assumption

       Only CIDv0 and CIDv1 are recognised. Input that neither starts
       with ``0x01`` nor has the exact CIDv0 shape (34 bytes beginning
       ``0x12 0x20``) is rejected rather than being wrapped as a CIDv0.
       If a CIDv2 is ever introduced this logic will need to be
       revisited.

    Raises ``ValueError`` if ``raw_bytes`` is shorter than two bytes or
    is not a recognisable CIDv0/CIDv1 encoding.
    """
    if len(raw_bytes) < 2:
        raise ValueError("CID bytes too short")

    # CIDv1: first byte is the version marker 0x01.
    if raw_bytes[0] == 0x01:
        return cid_from_bytes(raw_bytes)

    # CIDv0: must be exactly a sha2-256 multihash. Anything else is
    # malformed input, not a CID.
    looks_like_cidv0 = (
        len(raw_bytes) == _CIDV0_MULTIHASH_LENGTH
        and raw_bytes.startswith(_CIDV0_MULTIHASH_PREFIX)
    )
    if not looks_like_cidv0:
        raise ValueError(
            "Invalid CID bytes: expected CIDv1 (leading 0x01) or a 34-byte "
            "sha2-256 CIDv0 multihash (leading 0x12 0x20)"
        )
    return make_cid(0, "dag-pb", raw_bytes)


def _restore_from_cbor(node: Any) -> Any:
    """Recursively rebuild the decoded tree as plain dicts and lists.

    - CID objects are returned as-is (``_tag_hook`` already built them).
    - Dicts and lists are copied with their values restored.
    - Every other value is returned unchanged.
    """
    if is_cid(node):
        return node
    if isinstance(node, dict):
        return {key: _restore_from_cbor(value) for key, value in node.items()}
    if isinstance(node, list):
        return [_restore_from_cbor(item) for item in node]
    return node


codec = DagCborCodec()
"""Module-level singleton codec instance."""

# Module-level aliases so the module itself can be used like a codec.
name = codec.name
code = codec.code
encode = codec.encode
decode = codec.decode

register_codec(codec)