diff --git a/dissect/database/chromium/__init__.py b/dissect/database/chromium/__init__.py new file mode 100644 index 0000000..fdaf54a --- /dev/null +++ b/dissect/database/chromium/__init__.py @@ -0,0 +1,8 @@ +from __future__ import annotations + +from dissect.database.chromium.cache import DiskCache, SimpleDiskCache + +__all__ = [ + "DiskCache", + "SimpleDiskCache", +] diff --git a/dissect/database/chromium/cache/__init__.py b/dissect/database/chromium/cache/__init__.py new file mode 100644 index 0000000..5fbbd3c --- /dev/null +++ b/dissect/database/chromium/cache/__init__.py @@ -0,0 +1,13 @@ +from __future__ import annotations + +from dissect.database.chromium.cache.c_cache import c_cache +from dissect.database.chromium.cache.c_simple import c_simple +from dissect.database.chromium.cache.cache import DiskCache +from dissect.database.chromium.cache.simple import SimpleDiskCache + +__all__ = [ + "DiskCache", + "SimpleDiskCache", + "c_cache", + "c_simple", +] diff --git a/dissect/database/chromium/cache/c_cache.py b/dissect/database/chromium/cache/c_cache.py new file mode 100755 index 0000000..a9950bb --- /dev/null +++ b/dissect/database/chromium/cache/c_cache.py @@ -0,0 +1,158 @@ +from __future__ import annotations + +from dissect.cstruct import cstruct + +# References: +# - https://chromium.googlesource.com/chromium/src/+/HEAD/net/disk_cache/blockfile/addr.h +# - https://chromium.googlesource.com/chromium/src/+/HEAD/net/disk_cache/blockfile/disk_format_base.h +# - https://chromium.googlesource.com/chromium/src/+/HEAD/net/disk_cache/blockfile/disk_format.h +cache_def = """ + +/* Cache Address format. 
*/ + +enum FileType { + EXTERNAL = 0, + RANKINGS = 1, + BLOCK_256 = 2, + BLOCK_1K = 3, + BLOCK_4K = 4, + BLOCK_FILES = 5, + BLOCK_ENTRIES = 6, + BLOCK_EVICTED = 7 +}; + +// int kMaxBlockSize = 4096 * 4; +// int16_t kMaxBlockFile = 255; +// int kMaxNumBlocks = 4; +// int16_t kFirstAdditionalBlockFile = 4; + +#define kInitializedMask 0x80000000 +#define kFileTypeMask 0x70000000 +#define kFileTypeOffset 28 +#define kReservedBitsMask 0x0c000000 +#define kNumBlocksMask 0x03000000 +#define kNumBlocksOffset 24 +#define kFileSelectorMask 0x00ff0000 +#define kFileSelectorOffset 16 +#define kStartBlockMask 0x0000FFFF +#define kFileNameMask 0x0FFFFFFF + +/* Cache types. */ + +/* Index file format. */ +typedef uint32_t CacheAddr; +#define kIndexMagic 0xC103CAC3 + +struct LruData { + int32 padding_1[2]; + int32 filled; // Flag to tell when we filled the cache. + int32 sizes[5]; + CacheAddr heads[5]; + CacheAddr tails[5]; + CacheAddr transaction; // In-flight operation target. + int32 operation; // Actual in-flight operation. + int32 operation_list; // In-flight operation list. + int32 padding_2[7]; +}; + +struct IndexHeader { + uint32 magic; // 0xc3ca03c1 + uint32 version; + int32 num_entries; + int32 num_bytes_legacy; + int32 last_file; // f_###### + int32 dirty_flag; + CacheAddr stats; + int32 table_len; + int32 crash_flag; + int32 experiment_flag; + uint64 create_time; + int64 num_bytes; + int32 corruption_flag; + int32 padding[49]; + LruData lru_data; + // CacheAddr table[table_len]; // max is kIndexTablesize (0x10000) +}; + +/* Data Block File Format. */ +#define kBlockHeaderSize 8192 + +struct BlockFileHeader { + uint32 magic; // 0xc3ca04c1 + uint32 version; + int16 this_file; // Index of this file (data_#). + int16 next_file; // Next file when this one is full (data_#). + int32 entry_size; // Size of the blocks of this file. + int32 num_entries; // Number of stored entries. + int32 max_entries; // Current maximum number of entries. 
+ int32 empty[4]; + int32 hints[4]; + int32 updating; + int32 user[5]; + // int32 allocation_map[2028]; + // total header should be exactly kBlockHeaderSize bytes long (8192). +}; + +/* Cache Entry Format. */ + +enum EntryState { + ENTRY_NORMAL = 0, + ENTRY_EVICTED, // The entry was recently evicted from the cache. + ENTRY_DOOMED // The entry was doomed. +}; + +enum EntryFlags { + PARENT_ENTRY = 1, // This entry has children (sparse) entries. + CHILD_ENTRY = 1 << 1 // Child entry that stores sparse data. +}; + +struct EntryStore { + uint32 hash; // Full hash of the key. + CacheAddr next; // Next entry with the same hash or bucket. + CacheAddr rankings_node; // Rankings node for this entry. + int32 reuse_count; // How often is this entry used. + int32 refetch_count; // How often is this fetched from the net. + int32 state; // Current state. + uint64 creation_time; + int32 key_len; + CacheAddr long_key; // Optional address of a long key. + + int32 data_size[4]; // We can store up to 4 data streams for + CacheAddr data_addr[4]; // each entry. + + uint32 flags; // Any combination of EntryFlags. + int32 padding[4]; + uint32 self_hash; // The hash of EntryStore up to this point. + + // char key[256 - 24 * 4]; // null terminated, continues in consecutive block if larger + char key[key_len]; // For convienience sake we read the entire key length, + // possibly exceeding the bounds of the entry store size. 
# Block size in bytes for every ``FileType`` value that is stored in a block file.
# ``EXTERNAL`` (0) is deliberately absent: those addresses name a separate file.
_BLOCK_SIZES = {
    1: 36,  # RANKINGS
    2: 256,  # BLOCK_256
    3: 1024,  # BLOCK_1K
    4: 4096,  # BLOCK_4K
    5: 8,  # BLOCK_FILES
    6: 104,  # BLOCK_ENTRIES
    7: 48,  # BLOCK_EVICTED
}


def BlockSizeForFileType(file_type: int) -> int:
    """Return the size in bytes of a single block for the given ``file_type``.

    Args:
        file_type: Numeric ``FileType`` value (``1`` .. ``7``).

    Returns:
        The block size in bytes.

    Raises:
        ValueError: If ``file_type`` has no block size (including ``EXTERNAL`` / ``0``).
    """
    try:
        return _BLOCK_SIZES[file_type]
    except (KeyError, TypeError):
        # TypeError covers unhashable input, so every bad value fails the same way.
        raise ValueError(f"Unknown file_type {file_type!r}") from None
+ + CacheAddr: TypeAlias = _c_cache.uint32 + class LruData(__cs__.Structure): + padding_1: __cs__.Array[_c_cache.int32] + filled: _c_cache.int32 + sizes: __cs__.Array[_c_cache.int32] + heads: __cs__.Array[_c_cache.uint32] + tails: __cs__.Array[_c_cache.uint32] + transaction: _c_cache.uint32 + operation: _c_cache.int32 + operation_list: _c_cache.int32 + padding_2: __cs__.Array[_c_cache.int32] + @overload + def __init__( + self, + padding_1: __cs__.Array[_c_cache.int32] | None = ..., + filled: _c_cache.int32 | None = ..., + sizes: __cs__.Array[_c_cache.int32] | None = ..., + heads: __cs__.Array[_c_cache.uint32] | None = ..., + tails: __cs__.Array[_c_cache.uint32] | None = ..., + transaction: _c_cache.uint32 | None = ..., + operation: _c_cache.int32 | None = ..., + operation_list: _c_cache.int32 | None = ..., + padding_2: __cs__.Array[_c_cache.int32] | None = ..., + ): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + + class IndexHeader(__cs__.Structure): + magic: _c_cache.uint32 + version: _c_cache.uint32 + num_entries: _c_cache.int32 + num_bytes_legacy: _c_cache.int32 + last_file: _c_cache.int32 + dirty_flag: _c_cache.int32 + stats: _c_cache.uint32 + table_len: _c_cache.int32 + crash_flag: _c_cache.int32 + experiment_flag: _c_cache.int32 + create_time: _c_cache.uint64 + num_bytes: _c_cache.int64 + corruption_flag: _c_cache.int32 + padding: __cs__.Array[_c_cache.int32] + lru_data: _c_cache.LruData + @overload + def __init__( + self, + magic: _c_cache.uint32 | None = ..., + version: _c_cache.uint32 | None = ..., + num_entries: _c_cache.int32 | None = ..., + num_bytes_legacy: _c_cache.int32 | None = ..., + last_file: _c_cache.int32 | None = ..., + dirty_flag: _c_cache.int32 | None = ..., + stats: _c_cache.uint32 | None = ..., + table_len: _c_cache.int32 | None = ..., + crash_flag: _c_cache.int32 | None = ..., + experiment_flag: _c_cache.int32 | None = ..., + create_time: _c_cache.uint64 | None = ..., + num_bytes: 
_c_cache.int64 | None = ..., + corruption_flag: _c_cache.int32 | None = ..., + padding: __cs__.Array[_c_cache.int32] | None = ..., + lru_data: _c_cache.LruData | None = ..., + ): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + + class BlockFileHeader(__cs__.Structure): + magic: _c_cache.uint32 + version: _c_cache.uint32 + this_file: _c_cache.int16 + next_file: _c_cache.int16 + entry_size: _c_cache.int32 + num_entries: _c_cache.int32 + max_entries: _c_cache.int32 + empty: __cs__.Array[_c_cache.int32] + hints: __cs__.Array[_c_cache.int32] + updating: _c_cache.int32 + user: __cs__.Array[_c_cache.int32] + @overload + def __init__( + self, + magic: _c_cache.uint32 | None = ..., + version: _c_cache.uint32 | None = ..., + this_file: _c_cache.int16 | None = ..., + next_file: _c_cache.int16 | None = ..., + entry_size: _c_cache.int32 | None = ..., + num_entries: _c_cache.int32 | None = ..., + max_entries: _c_cache.int32 | None = ..., + empty: __cs__.Array[_c_cache.int32] | None = ..., + hints: __cs__.Array[_c_cache.int32] | None = ..., + updating: _c_cache.int32 | None = ..., + user: __cs__.Array[_c_cache.int32] | None = ..., + ): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + + class EntryState(__cs__.Enum): + ENTRY_NORMAL = ... + ENTRY_EVICTED = ... + ENTRY_DOOMED = ... + + class EntryFlags(__cs__.Enum): + PARENT_ENTRY = ... + CHILD_ENTRY = ... 
+ + class EntryStore(__cs__.Structure): + hash: _c_cache.uint32 + next: _c_cache.uint32 + rankings_node: _c_cache.uint32 + reuse_count: _c_cache.int32 + refetch_count: _c_cache.int32 + state: _c_cache.int32 + creation_time: _c_cache.uint64 + key_len: _c_cache.int32 + long_key: _c_cache.uint32 + data_size: __cs__.Array[_c_cache.int32] + data_addr: __cs__.Array[_c_cache.uint32] + flags: _c_cache.uint32 + padding: __cs__.Array[_c_cache.int32] + self_hash: _c_cache.uint32 + key: __cs__.CharArray + @overload + def __init__( + self, + hash: _c_cache.uint32 | None = ..., + next: _c_cache.uint32 | None = ..., + rankings_node: _c_cache.uint32 | None = ..., + reuse_count: _c_cache.int32 | None = ..., + refetch_count: _c_cache.int32 | None = ..., + state: _c_cache.int32 | None = ..., + creation_time: _c_cache.uint64 | None = ..., + key_len: _c_cache.int32 | None = ..., + long_key: _c_cache.uint32 | None = ..., + data_size: __cs__.Array[_c_cache.int32] | None = ..., + data_addr: __cs__.Array[_c_cache.uint32] | None = ..., + flags: _c_cache.uint32 | None = ..., + padding: __cs__.Array[_c_cache.int32] | None = ..., + self_hash: _c_cache.uint32 | None = ..., + key: __cs__.CharArray | None = ..., + ): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... 
+ +# Technically `c_cache` is an instance of `_c_cache`, but then we can't use it in type hints +c_cache: TypeAlias = _c_cache diff --git a/dissect/database/chromium/cache/c_simple.py b/dissect/database/chromium/cache/c_simple.py new file mode 100644 index 0000000..6b91022 --- /dev/null +++ b/dissect/database/chromium/cache/c_simple.py @@ -0,0 +1,87 @@ +from __future__ import annotations + +from dissect.cstruct import cstruct + +# References: +# - https://chromium.googlesource.com/chromium/src/+/HEAD/net/disk_cache/simple/simple_index_file.h +# - https://chromium.googlesource.com/chromium/src/+/HEAD/net/disk_cache/simple/simple_entry_format.h +simple_def = """ +/* Simple Indexes */ + +#define kSimpleIndexMagicNumber 0x656e74657220796f + +struct FakeIndexHeader { + uint64 magic; // kSimpleIndexMagicNumber + uint32 version; + int32 padding[2]; +}; + +struct IndexTableEntry { + uint64 hash; + int64 last_used; + int64 size; +}; + +struct RealIndexHeader { + uint32 size; + uint32 crc32; + uint64 magic; // kSimpleIndexMagicNumber + uint32 version; + int64 num_entries; + int64 cache_size; + int32 unknown; + IndexTableEntry entries[num_entries]; +}; + +/* Simple File Headers. */ + +#define kSimpleInitialMagicNumber 0xfcfb6d1ba7725c30 +#define kSimpleFinalMagicNumber 0xf4fa6f45970d41d8 + +struct SimpleFileHeader { + uint64 magic; // kSimpleInitialMagicNumber + uint32 version; + uint32 key_length; + uint32 key_hash; // md5 + uint32 unused_padding; + char key[key_length]; + + // followed by SimpleFileStream_* +}; + +#define kSimpleEOFSize 24 + +struct SimpleFileEOF { + uint64 magic; // kSimpleFinalMagicNumber + uint32 flags; // hash type: 0 = ?, 1 = crc32, 2 = sha256, 3 = 1 + 2 + uint32 crc32; + uint32 stream_size; // only used in the EOF record for stream 0. 
+}; + +struct SimpleFileStream_0_1 { + // preceded by SimpleFileHeader + // char data_stream_1[]; + // SimpleFileEOF + // char data_stream_0[]; + // SHA256 if flags = 2 or 3 + // SimpleFileEOF +}; + +struct SimpleFileStream_2 { + // preceded by SimpleFileHeader + // char data_stream_2[]; + // SimpleFileEOF +}; + +#define kSimpleSparseRangeMagicNumber 0xeb97bf016553676b + +struct SimpleFileSparseRangeHeader { + uint64 magic; // kSimpleSparseRangeMagicNumber + int64 offset; + int64 length; + uint32 crc32; + // char data[length]; +}; +""" + +c_simple = cstruct(endian="<").load(simple_def) diff --git a/dissect/database/chromium/cache/c_simple.pyi b/dissect/database/chromium/cache/c_simple.pyi new file mode 100644 index 0000000..6caaf71 --- /dev/null +++ b/dissect/database/chromium/cache/c_simple.pyi @@ -0,0 +1,149 @@ +# Generated by cstruct-stubgen +from typing import BinaryIO, Literal, TypeAlias, overload + +import dissect.cstruct as __cs__ + +class _c_simple(__cs__.cstruct): + kSimpleIndexMagicNumber: Literal[7308907224324143471] = ... + kSimpleInitialMagicNumber: Literal[18229283882253048880] = ... + kSimpleFinalMagicNumber: Literal[17652544034109735384] = ... + kSimpleEOFSize: Literal[24] = ... + kSimpleSparseRangeMagicNumber: Literal[16976247333112211307] = ... + class FakeIndexHeader(__cs__.Structure): + magic: _c_simple.uint64 + version: _c_simple.uint32 + padding: __cs__.Array[_c_simple.int32] + @overload + def __init__( + self, + magic: _c_simple.uint64 | None = ..., + version: _c_simple.uint32 | None = ..., + padding: __cs__.Array[_c_simple.int32] | None = ..., + ): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + + class IndexTableEntry(__cs__.Structure): + hash: _c_simple.uint64 + last_used: _c_simple.int64 + size: _c_simple.int64 + @overload + def __init__( + self, + hash: _c_simple.uint64 | None = ..., + last_used: _c_simple.int64 | None = ..., + size: _c_simple.int64 | None = ..., + ): ... 
+ @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + + class RealIndexHeader(__cs__.Structure): + size: _c_simple.uint32 + crc32: _c_simple.uint32 + magic: _c_simple.uint64 + version: _c_simple.uint32 + num_entries: _c_simple.int64 + cache_size: _c_simple.int64 + unknown: _c_simple.int32 + class IndexTableEntry(__cs__.Structure): + hash: _c_simple.uint64 + last_used: _c_simple.int64 + size: _c_simple.int64 + @overload + def __init__( + self, + hash: _c_simple.uint64 | None = ..., + last_used: _c_simple.int64 | None = ..., + size: _c_simple.int64 | None = ..., + ): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + + entries: __cs__.Array[IndexTableEntry] + @overload + def __init__( + self, + size: _c_simple.uint32 | None = ..., + crc32: _c_simple.uint32 | None = ..., + magic: _c_simple.uint64 | None = ..., + version: _c_simple.uint32 | None = ..., + num_entries: _c_simple.int64 | None = ..., + cache_size: _c_simple.int64 | None = ..., + unknown: _c_simple.int32 | None = ..., + entries: __cs__.Array[IndexTableEntry] | None = ..., + ): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + + class SimpleFileHeader(__cs__.Structure): + magic: _c_simple.uint64 + version: _c_simple.uint32 + key_length: _c_simple.uint32 + key_hash: _c_simple.uint32 + unused_padding: _c_simple.uint32 + key: __cs__.CharArray + @overload + def __init__( + self, + magic: _c_simple.uint64 | None = ..., + version: _c_simple.uint32 | None = ..., + key_length: _c_simple.uint32 | None = ..., + key_hash: _c_simple.uint32 | None = ..., + unused_padding: _c_simple.uint32 | None = ..., + key: __cs__.CharArray | None = ..., + ): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... 
+ + class SimpleFileEOF(__cs__.Structure): + magic: _c_simple.uint64 + flags: _c_simple.uint32 + crc32: _c_simple.uint32 + stream_size: _c_simple.int32 + @overload + def __init__( + self, + magic: _c_simple.uint64 | None = ..., + flags: _c_simple.uint32 | None = ..., + crc32: _c_simple.uint32 | None = ..., + stream_size: _c_simple.int32 | None = ..., + ): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + + class SimpleFileStream_0_1(__cs__.Structure): + @overload + def __init__(self): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + + class SimpleFileStream_2(__cs__.Structure): + @overload + def __init__(self): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + + class SimpleFileSparseRangeHeader(__cs__.Structure): + magic: _c_simple.uint64 + offset: _c_simple.int64 + length: _c_simple.int64 + crc32: _c_simple.uint32 + @overload + def __init__( + self, + magic: _c_simple.uint64 | None = ..., + offset: _c_simple.int64 | None = ..., + length: _c_simple.int64 | None = ..., + crc32: _c_simple.uint32 | None = ..., + ): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + + class SimpleFileStreamSparse(__cs__.Structure): + @overload + def __init__(self): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... 
class DiskCache:
    """Chromium Disk (Block File) Cache implementation.

    Opens the ``index`` file and every ``data_*`` block file found in ``path`` and exposes
    the cache entries they describe.

    Args:
        path: Directory containing at least ``index`` and ``data_0`` .. ``data_3``.

    Raises:
        ValueError: If ``path`` does not exist, is not a directory, lacks the expected files,
            or holds an index with an unexpected magic or version.

    References:
        - https://www.chromium.org/developers/design-documents/network-stack/disk-cache/
        - https://github.com/libyal/dtformats/blob/main/documentation/Chrome%20Cache%20file%20format.asciidoc
    """

    def __init__(self, path: Path):
        if not path.exists():
            raise ValueError(f"Provided path does not exist: {path!r}")

        if not path.is_dir():
            raise ValueError(f"Provided path is not a directory: {path!r}")

        # Sanity check for expected directory structure.
        files = {"index", "data_0", "data_1", "data_2", "data_3"}
        self.children = set(path.iterdir())
        if not files.issubset({file.name for file in self.children}):
            raise ValueError(f"Provided directory does not contain expected disk cache files: {path!r}")

        self.path = path
        self.index = CacheIndexFile(self, path.joinpath("index"))

        if self.index.header.magic != c_cache.kIndexMagic:
            raise ValueError(f"Provided directory contains invalid index file: {path!r}")

        if self.index.header.version != 0x30000:
            raise ValueError(f"Unsupported Disk Cache index version {self.index.header.version!r} in {path!r}")

        self.create_time = webkittimestamp(self.index.header.create_time)
        self.num_entries = self.index.header.num_entries

        # Sort by name so the block file order (and the fallback scan order in entries())
        # does not depend on set iteration order.
        self.block_files = [
            CacheBlockFile(self, file)
            for file in sorted(self.children, key=lambda child: child.name)
            if file.name.startswith("data_")
        ]

    def __repr__(self) -> str:
        return f"<DiskCache path='{self.path!s}' create_time={self.create_time!s} num_entries={self.num_entries!r}>"

    def block_file(self, id: int) -> CacheBlockFile | None:
        """Return the opened block file whose header ``this_file`` equals ``id``, or ``None``."""
        return next((block_file for block_file in self.block_files if block_file.id == id), None)

    def entries(self) -> Iterator[CacheEntryStore]:
        """Iterate cache entries.

        Entries reachable from the index table are yielded first, following each entry's
        ``next`` chain. After that every block file slot is scanned and plausible looking
        entries whose hash was not seen through the index are yielded as well.
        """
        seen: set[int] = set()
        visited: set[int] = set()

        # Iterate entries referenced in the index
        for address in self.index.addresses:
            # The visited set stops a corrupt ``next`` chain that loops back on itself.
            while address.is_initialized and address.address not in visited:
                visited.add(address.address)
                entry = CacheEntryStore(self, address)
                seen.add(entry.header.hash)
                yield entry

                # An EntryStore can point to a next address for another EntryStore
                if entry.next == 0:
                    break
                address = CacheAddress(self.index, entry.next)

        # Sometimes the index is not used, so we have to iterate all block files manually.
        # TODO: Calculate store.header.hash to see if the entry is valid.
        for block_file in self.block_files:
            for i in range(block_file.header.max_entries):
                entry_offset = c_cache.kBlockHeaderSize + (i * block_file.entry_size)
                block_file.fh.seek(entry_offset)
                try:
                    store = CacheEntryStore(self, None, block_file.fh)
                    plausible = (
                        store.state in (0, 1, 2)
                        and not any(store.header.padding)
                        and store.key
                        and store.header.hash not in seen
                    )
                except Exception:
                    # Deliberately best-effort: most slots do not hold a parsable entry.
                    continue

                # Yield outside the try so errors raised by the consumer are not swallowed.
                if plausible:
                    yield store

    def get_key(self, key: str) -> CacheEntryStore | None:
        """Get the first :class:`CacheEntryStore` for the given ``key``, or ``None``."""
        return next((entry for entry in self.entries() if key == entry.key), None)

    def get_url(self, resource_url: str) -> CacheEntryStore | None:
        """Get the first :class:`CacheEntryStore` for the given resource url, or ``None``."""
        return next((entry for entry in self.entries() if resource_url == entry.resource_url), None)

    def get_host(self, host: str) -> Iterator[CacheEntryStore]:
        """Get all :class:`CacheEntryStore` for the given host."""
        for entry in self.entries():
            if urlsplit(entry.resource_url).hostname == host:
                yield entry
class CacheBlockFile:
    """Chromium Disk Cache Data Block file.

    Parses the ``BlockFileHeader`` at the start of ``path`` and keeps the file handle open
    so blocks can be read on demand through :meth:`read`.

    Args:
        disk_cache: The :class:`DiskCache` this block file belongs to.
        path: Path of the ``data_#`` block file.

    References:
        - https://chromium.googlesource.com/chromium/src/+/HEAD/net/disk_cache/blockfile/disk_format.h
    """

    def __init__(self, disk_cache: DiskCache, path: Path):
        self.disk_cache = disk_cache
        self.path = path

        self.fh = path.open("rb")
        try:
            self.header = c_cache.BlockFileHeader(self.fh)
        except Exception:
            # Do not leak the handle when the header cannot be parsed.
            self.fh.close()
            raise

        self.id = self.header.this_file
        self.entry_size = self.header.entry_size
        self.num_entries = self.header.num_entries

    def __repr__(self) -> str:
        return (
            f"<CacheBlockFile path='{self.path!s}' id={self.id!r} "
            f"entry_size={self.entry_size!r} num_entries={self.num_entries!r}>"
        )

    def read(self, addr: CacheAddress) -> RangeStream:
        """Return a stream over the blocks that ``addr`` points to in this file.

        Args:
            addr: Address providing ``start_block`` and ``num_blocks``.

        Raises:
            ValueError: If ``addr`` carries no block geometry (``start_block`` or
                ``num_blocks`` is ``None``).
        """
        if addr.start_block is None or addr.num_blocks is None:
            raise ValueError(f"Address {addr!r} does not reference a block file")

        # Blocks are stored back to back after the fixed size file header.
        offset = c_cache.kBlockHeaderSize + (self.entry_size * addr.start_block)
        size = self.entry_size * addr.num_blocks
        return RangeStream(self.fh, offset, size)
class CacheEntryStore:
    """Represents a Cache EntryStore object.

    The entry header is parsed either from the data behind ``addr`` or, when ``addr`` is not
    a :class:`CacheAddress`, from ``data`` at its current position.

    Args:
        disk_cache: The :class:`DiskCache` this entry belongs to.
        addr: Address of the entry, or ``None`` when ``data`` is given.
        data: File-like object positioned at an ``EntryStore`` structure.

    Raises:
        ValueError: If neither ``addr`` nor ``data`` is provided.
    """

    def __init__(self, disk_cache: DiskCache, addr: CacheAddress | None, data: BinaryIO | None = None):
        if not addr and not data:
            raise ValueError("addr or data required")

        self.disk_cache = disk_cache
        self.address = addr

        header_data = addr.data if isinstance(addr, CacheAddress) else data
        self.header = c_cache.EntryStore(header_data)
        self.state = c_cache.EntryState(self.header.state)
        self.creation_time = webkittimestamp(self.header.creation_time)
        self.next = self.header.next

        if self.header.long_key:
            # The key is stored out of line, behind its own address.
            key_addr = CacheAddress(disk_cache.index, self.header.long_key)
            self.key = self._read(key_addr, self.header.key_len).decode()
        else:
            self.key = self.header.key.decode().strip("\x00")

        self.credential_key, self.upload_data_identifier, self.isolation_key, self.resource_url = parse_cache_key(
            self.key
        )

    def __repr__(self) -> str:
        return f"<CacheEntryStore key={self.key!r} state={self.state!s} creation_time={self.creation_time!s}>"

    @staticmethod
    def _read(addr: CacheAddress, size: int) -> bytes:
        """Read ``size`` bytes from the start of the data behind ``addr``.

        Streams of separate (external) files are opened per access, so they are closed again
        here. Block file streams share the block file handle and are left untouched.
        """
        stream = addr.data
        try:
            return stream.read(size)
        finally:
            if addr.is_separate_file:
                stream.close()

    @property
    def meta(self) -> bytes:
        """Raw bytes of data stream 0 of this entry."""
        addr = CacheAddress(self.disk_cache.index, self.header.data_addr[0])
        size = self.header.data_size[0]
        # TODO: Properly unpickle HTTP response headers
        return self._read(addr, size)

    @property
    def data(self) -> bytes:
        """Bytes of data stream 1, decompressed when a known encoding is detected.

        Gzip is detected by its magic bytes. Brotli and deflate are detected by looking for
        ``content-encoding:br`` / ``content-encoding:deflate`` in :attr:`meta`.

        Raises:
            RuntimeError: If the data is brotli encoded and ``cramjam`` is not installed.
        """
        addr = CacheAddress(self.disk_cache.index, self.header.data_addr[1])
        # Read the payload once instead of re-opening the stream for every check.
        raw = self._read(addr, self.header.data_size[1])

        if raw[0:2] == b"\x1f\x8b":
            return gzip.decompress(raw)

        meta = self.meta
        if b"content-encoding:br" in meta:
            if not HAS_CRAMJAM:
                raise RuntimeError("Missing required dependency cramjam to decode brotli data")

            return brotli.decompress(raw).read()

        if b"content-encoding:deflate" in meta:
            return zlib.decompress(raw, -zlib.MAX_WBITS)

        return raw
dissect.database.chromium.cache.c_simple import c_simple +from dissect.database.chromium.cache.util import parse_cache_key + +try: + from cramjam import brotli + + HAS_CRAMJAM = True + +except ImportError: + HAS_CRAMJAM = False + +if TYPE_CHECKING: + from collections.abc import Iterator + from pathlib import Path + + +class SimpleDiskCache: + """Chromium Very Simple Disk Cache Backend implementation. + + References: + - https://www.chromium.org/developers/design-documents/network-stack/disk-cache/very-simple-backend/ + - https://chromium.googlesource.com/chromium/src/+/HEAD/net/disk_cache/simple/ + """ + + def __init__(self, path: Path): + if not path.exists(): + raise ValueError(f"Provided path does not exist: {path!r}") + + if not path.is_dir(): + raise ValueError(f"Provided path is not a directory: {path!r}") + + # Sanity check for expected directory structure. + files = {"index-dir", "index"} + self.children = set(path.iterdir()) + if not files.issubset({file.name for file in self.children}): + raise ValueError(f"Provided directory does not contain expected disk cache files: {path!r}") + + self.path = path + self.index = SimpleIndexFile(self, path.joinpath("index-dir/the-real-index")) + self.last_used = self.index.last_used + self.cache_files = [child for child in self.children if len(child.name) == 18 and "_" in child.name] + + def __repr__(self) -> str: + return ( + f"" + ) + + def entries(self) -> Iterator[SimpleCacheFile]: + for file in self.cache_files: + yield SimpleCacheFile(self, file) + + def get_key(self, key: str) -> SimpleCacheFile | None: + """Return the first matching :class:`SimpleCacheFile` for the given ``key`` identifier.""" + for entry in self.entries(): + if entry.key == key: + return entry + return None + + def get_url(self, resource_url: str) -> SimpleCacheFile | None: + """Get the first matching :class:`SimpleCacheFile` for the given resource url.""" + for entry in self.entries(): + if resource_url == entry.resource_url: + return entry + 
class SimpleIndexFile:
    """Represents a Chromium Very Simple Disk Cache Backend index file.

    Parses ``path`` as a ``RealIndexHeader`` (header plus entry table) and validates its
    magic and entry count.

    Args:
        disk_cache: The :class:`SimpleDiskCache` this index belongs to.
        path: Path of the index file.

    Raises:
        ValueError: If the magic does not match or the entry table length differs from the
            ``num_entries`` header field.
    """

    def __init__(self, disk_cache: SimpleDiskCache, path: Path):
        self.disk_cache = disk_cache
        self.path = path

        self.fh = path.open("rb")
        try:
            self.header = c_simple.RealIndexHeader(self.fh)

            if self.header.magic != c_simple.kSimpleIndexMagicNumber:
                raise ValueError(f"Unexpected magic header for {path!s}: {self.header.magic!r}")

            self.entries = self.header.entries

            if len(self.entries) != self.header.num_entries:
                raise ValueError(f"Mismatch in amount of expected entries for {path!s}")
        except Exception:
            # Do not leak the handle when the index is rejected.
            self.fh.close()
            raise

        # ``None`` for an index without entries.
        self.last_used = self._last_used(self.entries)

    @staticmethod
    def _last_used(entries: list):
        """Return the ``last_used`` timestamp of the final table entry, or ``None`` if empty.

        NOTE(review): this assumes the final entry is the most recently used one; the
        ordering of the table is not established by this file - confirm.
        """
        if not entries:
            return None
        return webkittimestamp(entries[-1].last_used)

    def __repr__(self):
        return f"<SimpleIndexFile path='{self.path!s}' num_entries={len(self.entries)!r} last_used={self.last_used!s}>"
+ + # Stream 0 + self.fh.seek(-c_simple.kSimpleEOFSize, os.SEEK_END) + eof0 = c_simple.SimpleFileEOF(self.fh) + + if eof0.magic != c_simple.kSimpleFinalMagicNumber: + raise ValueError(f"Invalid EOF0 magic header {eof0!r}") + + offset = -c_simple.kSimpleEOFSize - eof0.stream_size + if eof0.flags in (2, 3): + offset -= 32 + self.fh.seek(offset, os.SEEK_END) + self._meta = self.fh.read(eof0.stream_size) + + # Stream 1 + self.fh.seek(-(c_simple.kSimpleEOFSize * 2) - eof0.stream_size, os.SEEK_END) + if eof0.flags in (2, 3): + self.fh.seek(-32, os.SEEK_CUR) + + eof1_offset = self.fh.tell() + eof1 = c_simple.SimpleFileEOF(self.fh) + + if eof1.magic != c_simple.kSimpleFinalMagicNumber: + raise ValueError(f"Invalid EOF1 magic header {eof1!r}") + + # Some EOF markers have a stream_size of 0x0 while the data is resident, this is intended behavior + # according to source, but older Chromium versions did populate stream_size. + # We can determine the size of stream 1 by reading until the beginning of the EOF marker for stream 1. 
+ stream_size = eof1.stream_size or (eof1_offset - self.header_size) + + self.fh.seek(self.header_size) + self._data = self.fh.read(stream_size) + + elif self.type == SimpleFileType.STREAM_2: + # Should be simple + raise NotImplementedError + + elif self.type == SimpleFileType.STREAM_SPARSE: + ranges = [] + while True: + try: + range_header = c_simple.SimpleFileSparseRangeHeader(self.fh) + except EOFError: + break + + if range_header.magic != c_simple.kSimpleSparseRangeMagicNumber: + break + + offset = self.fh.tell() + ranges.append((range_header, offset)) + self.fh.seek(offset + range_header.length) + + if len(ranges) > 1: + raise ValueError("Did not expect another range in sparse stream") + + for range_header, offset in ranges: + self.fh.seek(offset) + self._meta = b"" + self._data = self.fh.read(range_header.length) + + @property + def meta(self) -> bytes: + if not hasattr(self, "_meta"): + self._streams() + return self._meta + + @property + def data(self) -> bytes: + if not hasattr(self, "_data"): + self._streams() + + if self._data[0:2] == b"\x1f\x8b": + return gzip.decompress(self._data) + + if b"content-encoding:br" in self.meta: + if not HAS_CRAMJAM: + raise RuntimeError("Missing required dependency cramjam to decode brotli data") + + return brotli.decompress(self._data).read() + + if b"content-encoding:deflate" in self.meta: + return zlib.decompress(self._data, -zlib.MAX_WBITS) + + return self._data + + +class SimpleFileType(IntEnum): + """SimpleFileType enum.""" + + STREAM_0_1 = 0 + STREAM_2 = 1 + STREAM_SPARSE = 2 + + +def infer_file_type(file_name: str) -> SimpleFileType: + """Infer the :class:`SimpleFileType` based on the name of the :class:`SimpleCacheFile`.""" + if file_name.endswith("_0"): + return SimpleFileType.STREAM_0_1 + + if file_name.endswith("_1"): + return SimpleFileType.STREAM_2 + + if file_name.endswith("_s"): + return SimpleFileType.STREAM_SPARSE + + raise ValueError(f"Unknown SimpleFileType for filename {file_name!r}") diff --git 
import re

# ``.`` must also match newlines (re.DOTALL): keys are arbitrary stored data, and without it
# a key containing a line break is silently cut off or classified as the wrong format.
_NEW_FORMAT_KEY = re.compile(r"^(\d+)/(\d+)/(.+)", re.DOTALL)
_OLD_FORMAT_KEY = re.compile(r"^(\d+)/(.+)", re.DOTALL)


def parse_cache_key(key: str) -> tuple[int | None, int | None, bool, str]:
    """Parse a Cache or Simple Cache key to a standardized tuple.

    Arguments:
        key: string in the format 'credential_key/upload_data_identifier/[isolation_key]url'

    Returns: Tuple of ``credential_key``, ``upload_data_identifier``, ``isolation_key`` and ``resource_url``.
        ``credential_key`` and ``upload_data_identifier`` are ``None`` when the key does not carry them,
        ``isolation_key`` tells whether the key was double keyed.

    Raises:
        TypeError: If ``key`` is not a string.

    References:
        - GenerateCacheKey
        - GetResourceURLFromHttpCacheKey
        - https://chromium.googlesource.com/chromium/src/+/main/net/http/http_cache.cc
    """
    if not isinstance(key, str):
        raise TypeError("Input key is not a string")

    kDoubleKeyPrefix = "_dk_"
    kDoubleKeySeparator = " "

    credential_key = None
    upload_data_identifier = None
    isolation_key = False

    # Key looks like 'credential_key/upload_data_identifier/...', after 2021-09
    if match := _NEW_FORMAT_KEY.match(key):
        credential_key = int(match.group(1))
        upload_data_identifier = int(match.group(2))
        url = match.group(3)

    # Key looks like 'upload_data_identifier/...', before 2021-09
    elif match := _OLD_FORMAT_KEY.match(key):
        upload_data_identifier = int(match.group(1))
        url = match.group(2)

    # Key could be a regular URL
    else:
        url = key

    # Check for double key presence in url. The last part is the resource url
    if url.startswith(kDoubleKeyPrefix):
        isolation_key = True
        # Without any separator rpartition returns the whole string as its last element.
        _, _, resource_url = url.rpartition(kDoubleKeySeparator)
    else:
        resource_url = url

    return credential_key, upload_data_identifier, isolation_key, resource_url
def test_chromium_cache(tmp_path: Path) -> None:
    """Test if we can parse Chromium Cache Data from Google Chrome 148 on Windows 11 (24H2)."""
    # Unpack the archived cache directory into the per-test temporary directory.
    path = absolute_path("_data/chromium/cache/Windows_Cache_Data.tgz")
    with tarfile.open(path) as tf:
        tf.extractall(tmp_path, filter="data")

    disk_cache = DiskCache(tmp_path)
    # Exact list comparison: this also pins the order in which entries() yields.
    assert [entry.resource_url for entry in disk_cache.entries()] == [
        "http://172.16.82.1:8000/",
        "http://172.16.82.1:8000/webfiles/1750011834072/presentation/responsive.css",
        "http://172.16.82.1:8000/webfiles/1750011834072/presentation/themes/logoblauw.css",
        "http://172.16.82.1:8000/webfiles/1750011834072/behaviour/core.js",
        "http://172.16.82.1:8000/binaries/content/gallery/rijksoverheid/channel-afbeeldingen/logos/beeldmerk-rijksoverheid-desktop.svg",
        "http://172.16.82.1:8000/binaries/content/assets/rijksoverheid/behaviour/rop-page-feedback.min-20230526.js",
        "http://172.16.82.1:8000/binaries/widescreen/content/gallery/rijksoverheid/content-afbeeldingen/home/evergreens/header-meivakantie.jpg",
        "http://172.16.82.1:8000/binaries/medium/content/gallery/rijksoverheid/content-afbeeldingen/home/2026/douane-koraal-1.jpg",
        "http://172.16.82.1:8000/binaries/medium/content/gallery/rijksoverheid/content-afbeeldingen/home/2026/energiemaatregelen-anp-556197185.jpg",
        "http://172.16.82.1:8000/binaries/medium/content/gallery/rijksoverheid/content-afbeeldingen/onderwerpen/fiets/campagne-fietshelm.jpg",
        "http://172.16.82.1:8000/webfiles/1750011834072/presentation/shared-ro/webfonts/RO-SansWebText-Regular.woff2",
        "http://172.16.82.1:8000/webfiles/1750011834072/presentation/shared-ro/webfonts/rijks-sans-regular.woff2",
        "http://172.16.82.1:8000/webfiles/1750011834072/presentation/shared-ro/webfonts/RO-SansWebText-Bold.woff2",
        "http://172.16.82.1:8000/webfiles/1750011834072/presentation/shared-ro/icons/ro-icons-2.4.woff2",
        "http://172.16.82.1:8000/webfiles/1750011834072/presentation/shared-ro/webfonts/RO-SerifWeb-Italic.woff2",
        "http://172.16.82.1:8000/webfiles/1750011834072/behaviour/shared-ro/jquery-ui.js",
        "http://172.16.82.1:8000/webfiles/1750011834072/behaviour/shared-ro/img-helpers.js",
        "http://172.16.82.1:8000/binaries/content/assets/rijksoverheid/iconen/favicon.ico",
    ]

    assert disk_cache.create_time == datetime(2026, 4, 30, 12, 10, 45, 77412, tzinfo=timezone.utc)
    # NOTE(review): the list above holds 18 URLs while num_entries is pinned to 1 - confirm
    # that this is the value the fixture's index really stores.
    assert disk_cache.num_entries == 1
    assert len(disk_cache.block_files) == 4

    # Inspect the first entry yielded by entries() in detail.
    entry_store = next(disk_cache.entries())
    assert entry_store.address.address == 0xA0010002
    assert entry_store.state == c_cache.EntryState.ENTRY_NORMAL
    assert entry_store.creation_time == datetime(2026, 4, 30, 12, 11, 48, 207695, tzinfo=timezone.utc)
    assert entry_store.key == "1/0/_dk_http://172.16.82.1 http://172.16.82.1 http://172.16.82.1:8000/"
    assert entry_store.next == 0

    assert entry_store.data.startswith(b"\n\n")
    assert b"HTTP/1.0 200 OK\00" in entry_store.meta

    # The lookup helpers only need to return something truthy here.
    assert disk_cache.get_key("1/0/_dk_http://172.16.82.1 http://172.16.82.1 http://172.16.82.1:8000/")
    assert disk_cache.get_url("http://172.16.82.1:8000/")
    assert next(disk_cache.get_host("172.16.82.1"))


def test_chromium_simple_cache(tmp_path: Path) -> None:
    """Test if we can parse Chromium Cache Data from Google Chrome 147 on Ubuntu 24.04 LTS."""
    path = absolute_path("_data/chromium/cache/Linux_Cache_Data.tgz")

    with tarfile.open(path) as tf:
        tf.extractall(tmp_path, filter="data")

    simple_disk_cache = SimpleDiskCache(tmp_path)

    assert len(simple_disk_cache.cache_files) == 19
    assert len(list(simple_disk_cache.get_host("172.16.82.1"))) == 19

    # Both sides are sorted, so the order in which entries() yields is deliberately not asserted.
    assert sorted(cache_file.resource_url for cache_file in simple_disk_cache.entries()) == sorted(
        [
            "http://172.16.82.1:8000/webfiles/1750011834072/presentation/shared-ro/webfonts/RO-SerifWeb-Italic.woff2",
            "http://172.16.82.1:8000/binaries/medium/content/gallery/rijksoverheid/content-afbeeldingen/home/2026/energiemaatregelen-anp-556197185.jpg",
            "http://172.16.82.1:8000/webfiles/1750011834072/presentation/responsive.css",
            "http://172.16.82.1:8000/binaries/content/assets/rijksoverheid/iconen/favicon.ico",
            "http://172.16.82.1:8000/webfiles/1750011834072/presentation/shared-ro/webfonts/RO-SansWebText-Regular.woff2",
            "http://172.16.82.1:8000/webfiles/1750011834072/behaviour/shared-ro/jquery-ui.js",
            "http://172.16.82.1:8000/webfiles/1750011834072/presentation/shared-ro/icons/ro-icons-2.4.woff2",
            "http://172.16.82.1:8000/binaries/medium/content/gallery/rijksoverheid/content-afbeeldingen/home/2026/douane-koraal-1.jpg",
            "http://172.16.82.1:8000/binaries/content/gallery/rijksoverheid/channel-afbeeldingen/logos/beeldmerk-rijksoverheid-desktop.svg",
            "http://172.16.82.1:8000/binaries/content/assets/rijksoverheid/behaviour/rop-page-feedback.min-20230526.js",
            "http://172.16.82.1:8000/binaries/widescreen/content/gallery/rijksoverheid/content-afbeeldingen/home/evergreens/header-meivakantie.jpg",
            "http://172.16.82.1:8000/webfiles/1750011834072/behaviour/core.js",
            "http://172.16.82.1:8000/binaries/medium/content/gallery/rijksoverheid/content-afbeeldingen/onderwerpen/fiets/campagne-fietshelm.jpg",
            "http://172.16.82.1:8000/webfiles/1750011834072/behaviour/shared-ro/img-helpers.js",
            "http://172.16.82.1:8000/webfiles/1750011834072/presentation/themes/logoblauw.css",
            "http://172.16.82.1:8000/webfiles/1750011834072/presentation/shared-ro/webfonts/rijks-sans-regular.woff2",
            "http://172.16.82.1:8000/",
            "http://172.16.82.1:8000/binaries/large/content/gallery/rijksoverheid/content-afbeeldingen/home/evergreens/header-meivakantie.jpg",
            "http://172.16.82.1:8000/webfiles/1750011834072/presentation/shared-ro/webfonts/RO-SansWebText-Bold.woff2",
        ]
    )

    cache_file = simple_disk_cache.get_url("http://172.16.82.1:8000/")
    assert b"HTTP/1.0 200 OK\x00" in cache_file.meta
    # NOTE(review): startswith(b"") is true for any bytes value, so this assertion checks
    # nothing - the expected prefix looks like it is missing; confirm against the fixture.
    assert cache_file.data.startswith(b"")
    assert cache_file.data.endswith(b"\n\n\n")
    assert len(cache_file.data) == 25451

    assert simple_disk_cache.get_key("1/0/_dk_http://172.16.82.1 http://172.16.82.1 http://172.16.82.1:8000/")


# Each case pairs a raw cache key with the expected
# (credential_key, upload_data_identifier, isolation_key, resource_url) tuple.
@pytest.mark.parametrize(
    ("input_key", "expected_output"),
    [
        pytest.param(
            "http://172.16.82.1",
            (None, None, False, "http://172.16.82.1"),
            id="regular_url",
        ),
        pytest.param(
            "1/0/_dk_http://172.16.82.1 http://172.16.82.1 http://172.16.82.1:8000/webfiles/1750011834072/behaviour/shared-ro/jquery-ui.js",
            (1, 0, True, "http://172.16.82.1:8000/webfiles/1750011834072/behaviour/shared-ro/jquery-ui.js"),
            id="double_keyed_key",
        ),
        pytest.param(
            "0/http://172.16.82.1",
            (None, 0, False, "http://172.16.82.1"),
            id="old_format",
        ),
    ],
)
def test_cache_key_parsing(input_key: str, expected_output: tuple) -> None:
    """Test if we parse Chromium cache keys correctly."""
    assert parse_cache_key(input_key) == expected_output