参考资料

chunk

chunk 文件是在一个 block 的 chunks/ 目录下创建的。
每个段文件的最大尺寸是 512MB。
文件中的 chunk 由索引通过一个 uint64 来引用，该 uint64 由文件内偏移量（低 4 个字节）和段序列号（高 4 个字节）组成。

┌──────────────────────────────┐
│  magic(0x85BD40DD) <4 byte>  │
├──────────────────────────────┤
│    version(1) <1 byte>       │
├──────────────────────────────┤
│    padding(0) <3 byte>       │
├──────────────────────────────┤
│ ┌──────────────────────────┐ │
│ │         Chunk 1          │ │
│ ├──────────────────────────┤ │
│ │          ...             │ │
│ ├──────────────────────────┤ │
│ │         Chunk N          │ │
│ └──────────────────────────┘ │
└──────────────────────────────┘

┌───────────────┬───────────────────┬──────────────┬────────────────┐
│ len <uvarint> │ encoding <1 byte> │ data <bytes> │ CRC32 <4 byte> │
└───────────────┴───────────────────┴──────────────┴────────────────┘

读 WAL 文件

// ┌───────────┬──────────┬────────────┬──────────────┐
// │ type <1b> │ len <2b> │ CRC32 <4b> │ data <bytes> │
// └───────────┴──────────┴────────────┴──────────────┘

// next reads fragments from r.rdr until one complete record has been
// assembled. Uncompressed payloads are accumulated in r.rec; compressed
// payloads are accumulated in r.snappyBuf and decoded into r.rec once the
// final fragment (recLast or recFull) has been read. r.total is advanced by
// every byte consumed. A non-nil error is returned on read failures, an
// oversized fragment, a checksum mismatch, non-zero page padding, or when
// validateRecord rejects the fragment type.
func (r *Reader) next() (err error) {
    // We have to use r.buf since allocating byte arrays here fails escape
    // analysis and ends up on the heap, even though it seemingly should not.
    hdr := r.buf[:recordHeaderSize]
    // The remainder of r.buf receives the fragment payload.
    buf := r.buf[recordHeaderSize:]

    // Reset the output record (length only; the backing array is reused).
    r.rec = r.rec[:0]
    // Reset the scratch buffer that collects snappy-compressed fragments.
    r.snappyBuf = r.snappyBuf[:0]

    i := 0
    for {
        // Read the first header byte; it carries the fragment type and flags.
        if _, err = io.ReadFull(r.rdr, hdr[:1]); err != nil {
            return errors.Wrap(err, "read first header byte")
        }
        r.total++
        r.curRecTyp = recTypeFromHeader(hdr[0])
        // Whether the snappy flag bit is set in the first header byte.
        compressed := hdr[0]&snappyMask != 0

        // Gobble up zero bytes.
        if r.curRecTyp == recPageTerm {
            // recPageTerm is a single byte that indicates the rest of the page is padded.
            // If it's the first byte in a page, buf is too small and
            // needs to be resized to fit pageSize-1 bytes.
            buf = r.buf[1:]

            // We are pedantic and check whether the zeros are actually up
            // to a page boundary.
            // It's not strictly necessary but may catch sketchy state early.
            k := pageSize - (r.total % pageSize)
            if k == pageSize {
                continue // Initial 0 byte was last page byte.
            }
            n, err := io.ReadFull(r.rdr, buf[:k])
            if err != nil {
                return errors.Wrap(err, "read remaining zeros")
            }
            r.total += int64(n)

            for _, c := range buf[:k] {
                if c != 0 {
                    return errors.New("unexpected non-zero byte in padded page")
                }
            }
            continue
        }

        // Read the rest of the header (length and checksum).
        n, err := io.ReadFull(r.rdr, hdr[1:])
        if err != nil {
            return errors.Wrap(err, "read remaining header")
        }
        // Account for the header bytes just read.
        r.total += int64(n)

        var (
            // Payload length, 2 bytes.
            length = binary.BigEndian.Uint16(hdr[1:])
            // Checksum, 4 bytes.
            crc = binary.BigEndian.Uint32(hdr[3:])
        )

        // A fragment payload can never exceed the space left in a page
        // after the header.
        if length > pageSize-recordHeaderSize {
            return errors.Errorf("invalid record size %d", length)
        }
        // Read the payload.
        n, err = io.ReadFull(r.rdr, buf[:length])
        if err != nil {
            return err
        }
        // Account for the payload bytes just read.
        r.total += int64(n)

        if n != int(length) {
            return errors.Errorf("invalid size: expected %d, got %d", length, n)
        }
        // Verify the payload checksum (computed with castagnoliTable) against
        // the value stored in the header.
        if c := crc32.Checksum(buf[:length], castagnoliTable); c != crc {
            return errors.Errorf("unexpected checksum %x, expected %x", c, crc)
        }

        if compressed {
            // Compressed: collect the raw bytes; they are decoded once the
            // final fragment has arrived.
            r.snappyBuf = append(r.snappyBuf, buf[:length]...)
        } else {
            // Uncompressed: append the payload directly to the record.
            r.rec = append(r.rec, buf[:length]...)
        }

        // Validate the fragment type against i, the number of fragments
        // already accepted for this record.
        // NOTE(review): validateRecord is defined elsewhere; presumably it
        // enforces the first/middle/last ordering — confirm.
        if err := validateRecord(r.curRecTyp, i); err != nil {
            return err
        }
        if r.curRecTyp == recLast || r.curRecTyp == recFull {
            if compressed && len(r.snappyBuf) > 0 {
                // The snappy library uses `len` to calculate if we need a new buffer.
                // In order to allocate as few buffers as possible make the length
                // equal to the capacity.
                r.rec = r.rec[:cap(r.rec)]
                r.rec, err = snappy.Decode(r.rec, r.snappyBuf)
                return err
            }
            return nil
        }

        // Only increment i for non-zero records since we use it
        // to determine valid content record sequences.
        i++
    }
}

包含头信息和数据信息，头中包含了类别，长度 crc 校验和。
注意一个 WAL 文件，默认是 128MB，也就是一个 segment 的大小，
WAL 按页（page）写入，一页是 32KB。
在存 record 的时候，不足一页（32KB）的记录按照一条来存，超过一页的记录则拆分成多个片段存到多页中，这些片段
包含的头 type 也不同，具体的 type 取值为：
- 0: rest of page will be empty
- 1: a full record encoded in a single fragment
- 2: first fragment of a record
- 3: middle fragment of a record
- 4: final fragment of a record

recType

// Fragment types carried in the first byte of a WAL record header.
// Values are sequential starting at 0.
const (
    recPageTerm recType = iota // Rest of page is empty.
    recFull                    // Full record.
    recFirst                   // First fragment of a record.
    recMiddle                  // Middle fragments of a record.
    recLast                    // Final fragment of a record.
)

series 类型的 record

┌────────────────────────────────────────────┐
│ type = 1 <1b>                              │
├────────────────────────────────────────────┤
│ ┌─────────┬──────────────────────────────┐ │
│ │ id <8b> │ n = len(labels) <uvarint>    │ │
│ ├─────────┴────────────┬─────────────────┤ │
│ │ len(str_1) <uvarint> │ str_1 <bytes>   │ │
│ ├──────────────────────┴─────────────────┤ │
│ │  ...                                   │ │
│ ├───────────────────────┬────────────────┤ │
│ │ len(str_2n) <uvarint> │ str_2n <bytes> │ │
│ └───────────────────────┴────────────────┘ │
│                  . . .                     │
└────────────────────────────────────────────┘

首先是类别（type），然后是 series 的 id 号，接着是该 series 对应的 label 的数量，最后是各个 label 的 name 和 value。

其中 type 的可能取值为：

    // Unknown is returned for unrecognised WAL record types.
    Unknown Type = 255
    // Series is used to match WAL records of type Series.
    Series Type = 1
    // Samples is used to match WAL records of type Samples.
    Samples Type = 2
    // Tombstones is used to match WAL records of type Tombstones.
    Tombstones Type = 3
    // Exemplars is used to match WAL records of type Exemplars.
    Exemplars Type = 4

其中，如果有多条，那么接下来的还是有 id, label 等同样的数据结构

// Series decodes a Series-type record from rec, appending every decoded
// entry to series and returning the extended slice. Each entry consists of
// an 8-byte big-endian reference, a uvarint label count, and that many
// name/value string pairs; the labels of each entry are sorted before being
// stored. An error is returned if the record type byte is not Series, if the
// decoder reports an error, or if undecoded bytes remain.
func (d *Decoder) Series(rec []byte, series []RefSeries) ([]RefSeries, error) {
    dec := encoding.Decbuf{B: rec}

    // The first byte must identify a Series record.
    if got := Type(dec.Byte()); got != Series {
        return nil, errors.New("invalid record type")
    }

    for {
        // Stop once the input is drained or the decoder has failed.
        if len(dec.B) == 0 || dec.Err() != nil {
            break
        }

        // Series reference, followed by the number of labels.
        id := dec.Be64()
        lset := make(labels.Labels, dec.Uvarint())

        for i := 0; i < len(lset); i++ {
            l := &lset[i]
            l.Name = dec.UvarintStr()
            l.Value = dec.UvarintStr()
        }
        sort.Sort(lset)

        entry := RefSeries{
            Ref:    chunks.HeadSeriesRef(storage.SeriesRef(id)),
            Labels: lset,
        }
        series = append(series, entry)
    }

    if decErr := dec.Err(); decErr != nil {
        return nil, decErr
    }
    // Trailing bytes mean the record was not fully consumed.
    if left := len(dec.B); left > 0 {
        return nil, errors.Errorf("unexpected %d bytes left in entry", left)
    }
    return series, nil
}
// Series appends the encoded series to b and returns the resulting slice.
// The output starts with the Series type byte; each entry then contributes
// its reference as 8 big-endian bytes, a uvarint label count, and the
// length-prefixed name and value of every label.
func (e *Encoder) Series(series []RefSeries, b []byte) []byte {
    out := encoding.Encbuf{B: b}

    // Record type marker.
    out.PutByte(byte(Series))

    for i := range series {
        entry := &series[i]

        // Series reference and label count.
        out.PutBE64(uint64(entry.Ref))
        out.PutUvarint(len(entry.Labels))

        for j := range entry.Labels {
            out.PutUvarintStr(entry.Labels[j].Name)
            out.PutUvarintStr(entry.Labels[j].Value)
        }
    }

    // Hand back the serialized bytes.
    return out.Get()
}

WAL disk 磁盘格式

┌───────────┬──────────┬────────────┬──────────────┐
│ type <1b> │ len <2b> │ CRC32 <4b> │ data <bytes> │
└───────────┴──────────┴────────────┴──────────────┘

实现见保存到磁盘的数据结构

index 文件格式

┌────────────────────────────┬─────────────────────┐
│ magic(0xBAAAD700) <4b>     │ version(1) <1 byte> │
├────────────────────────────┴─────────────────────┤
│ ┌──────────────────────────────────────────────┐ │
│ │                 Symbol Table                 │ │
│ ├──────────────────────────────────────────────┤ │
│ │                    Series                    │ │
│ ├──────────────────────────────────────────────┤ │
│ │                 Label Index 1                │ │
│ ├──────────────────────────────────────────────┤ │
│ │                      ...                     │ │
│ ├──────────────────────────────────────────────┤ │
│ │                 Label Index N                │ │
│ ├──────────────────────────────────────────────┤ │
│ │                   Postings 1                 │ │
│ ├──────────────────────────────────────────────┤ │
│ │                      ...                     │ │
│ ├──────────────────────────────────────────────┤ │
│ │                   Postings N                 │ │
│ ├──────────────────────────────────────────────┤ │
│ │               Label Offset Table             │ │
│ ├──────────────────────────────────────────────┤ │
│ │             Postings Offset Table            │ │
│ ├──────────────────────────────────────────────┤ │
│ │                      TOC                     │ │
│ └──────────────────────────────────────────────┘ │
└──────────────────────────────────────────────────┘

下面描述的大部分部分都以 len 字段开始。
它总是指定尾部 CRC32 校验和之前的字节数。
校验和总是通过这些 len 字节来计算。
在 index 文件中，所有超出 len 所覆盖的部分、到下一个 len 开始之前中间填充的 0，都可以忽略。

symbol table

这里记录的是 series 中的label name 和 value 涉及到的名称和值，
记录后，后面要使用的时候，直接使用 ref 来标记，节省了大量的空间。

┌────────────────────┬─────────────────────┐
│ len <4b>           │ #symbols <4b>       │
├────────────────────┴─────────────────────┤
│ ┌──────────────────────┬───────────────┐ │
│ │ len(str_1) <uvarint> │ str_1 <bytes> │ │
│ ├──────────────────────┴───────────────┤ │
│ │                . . .                 │ │
│ ├──────────────────────┬───────────────┤ │
│ │ len(str_n) <uvarint> │ str_n <bytes> │ │
│ └──────────────────────┴───────────────┘ │
├──────────────────────────────────────────┤
│ CRC32 <4b>                               │
└──────────────────────────────────────────┘

series

┌───────────────────────────────────────┐
│ ┌───────────────────────────────────┐ │
│ │   series_1                        │ │
│ ├───────────────────────────────────┤ │
│ │                 . . .             │ │
│ ├───────────────────────────────────┤ │
│ │   series_n                        │ │
│ └───────────────────────────────────┘ │
└───────────────────────────────────────┘

该部分包含一系列的 series，每个 series 持有其 label set 以及它在 block 中的 chunks。
这些 series 按其 label set 的字典序排序。
每个 series 的条目首先是它的 label 数量，然后是包含标签名称和值的符号表引用的元组（tuple）。
label pair 按字典序排序。
在标签之后，indexed chunks 的数量被编码，然后是一连串的 metadata entries，其中包含块的最小（mint）和最大（maxt）时间戳，以及它在块文件中的位置参考。
mint 是第一个样本的时间，maxt 是该块中最后一个样本的时间。
将时间范围数据保存在索引中，允许放弃与查询的时间范围无关的块，而不直接访问它们。
第一个 chunk 的 mint 被存储，它的 maxt 被存储为 delta，mint 和 maxt 被编码为后续块的前一个时间的 delta。
同样地，第一个块的引用被存储，下一个引用被存储为与前一个的 delta。

格式

┌──────────────────────────────────────────────────────────────────────────┐
│ len <uvarint>                                                            │
├──────────────────────────────────────────────────────────────────────────┤
│ ┌──────────────────────────────────────────────────────────────────────┐ │
│ │                     labels count <uvarint64>                         │ │
│ ├──────────────────────────────────────────────────────────────────────┤ │
│ │              ┌────────────────────────────────────────────┐          │ │
│ │              │ ref(l_i.name) <uvarint32>                  │          │ │
│ │              ├────────────────────────────────────────────┤          │ │
│ │              │ ref(l_i.value) <uvarint32>                 │          │ │
│ │              └────────────────────────────────────────────┘          │ │
│ │                             ...                                      │ │
│ ├──────────────────────────────────────────────────────────────────────┤ │
│ │                     chunks count <uvarint64>                         │ │
│ ├──────────────────────────────────────────────────────────────────────┤ │
│ │              ┌────────────────────────────────────────────┐          │ │
│ │              │ c_0.mint <varint64>                        │          │ │
│ │              ├────────────────────────────────────────────┤          │ │
│ │              │ c_0.maxt - c_0.mint <uvarint64>            │          │ │
│ │              ├────────────────────────────────────────────┤          │ │
│ │              │ ref(c_0.data) <uvarint64>                  │          │ │
│ │              └────────────────────────────────────────────┘          │ │
│ │              ┌────────────────────────────────────────────┐          │ │
│ │              │ c_i.mint - c_i-1.maxt <uvarint64>          │          │ │
│ │              ├────────────────────────────────────────────┤          │ │
│ │              │ c_i.maxt - c_i.mint <uvarint64>            │          │ │
│ │              ├────────────────────────────────────────────┤          │ │
│ │              │ ref(c_i.data) - ref(c_i-1.data) <varint64> │          │ │
│ │              └────────────────────────────────────────────┘          │ │
│ │                             ...                                      │ │
│ └──────────────────────────────────────────────────────────────────────┘ │
├──────────────────────────────────────────────────────────────────────────┤
│ CRC32 <4b>                                                               │
└──────────────────────────────────────────────────────────────────────────┘

label index

索引 label 的名称和可能取值，#names 是 label name 的个数，#entries 是可能取值的个数

┌───────────────┬────────────────┬────────────────┐
│ len <4b>      │ #names <4b>    │ #entries <4b>  │
├───────────────┴────────────────┴────────────────┤
│ ┌─────────────────────────────────────────────┐ │
│ │ ref(value_0) <4b>                           │ │
│ ├─────────────────────────────────────────────┤ │
│ │ ...                                         │ │
│ ├─────────────────────────────────────────────┤ │
│ │ ref(value_n) <4b>                           │ │
│ └─────────────────────────────────────────────┘ │
│                      . . .                      │
├─────────────────────────────────────────────────┤
│ CRC32 <4b>                                      │
└─────────────────────────────────────────────────┘

比如具有 1 个 label 名称和 4 个可能取值的 label index，其格式如下：

┌────┬───┬───┬──────────────┬──────────────┬──────────────┬──────────────┬───────┐
│ 24 │ 1 │ 4 │ ref(value_0) | ref(value_1) | ref(value_2) | ref(value_3) | CRC32 |
└────┴───┴───┴──────────────┴──────────────┴──────────────┴──────────────┴───────┘

其中 24 表示距离 CRC 校验和的 offset。
1 表示当前 label name 的个数，4 表示 name 的可能取值。
ref 则是一个元组，其中一个是 name 在 symbol table 中的偏移量，另一个是 value 也是在 symbol table 中的偏移量。

posting

posting 的概念，能理解成 series，是一个模板。

其格式为

┌────────────────────┬────────────────────┐
│ len <4b>           │ #entries <4b>      │
├────────────────────┴────────────────────┤
│ ┌─────────────────────────────────────┐ │
│ │ ref(series_1) <4b>                  │ │
│ ├─────────────────────────────────────┤ │
│ │ ...                                 │ │
│ ├─────────────────────────────────────┤ │
│ │ ref(series_n) <4b>                  │ │
│ └─────────────────────────────────────┘ │
├─────────────────────────────────────────┤
│ CRC32 <4b>                              │
└─────────────────────────────────────────┘

entries 表示这些 posting 的条数，这个 posting section 需要和 posting offset table 结合使用，offset table 中的值指向了 posting 中的一个条目

posting offset table

用来实例化一个 series 的部分

┌─────────────────────┬──────────────────────┐
│ len <4b>            │ #entries <4b>        │
├─────────────────────┴──────────────────────┤
│ ┌────────────────────────────────────────┐ │
│ │  n = 2 <1b>                            │ │
│ ├──────────────────────┬─────────────────┤ │
│ │ len(name) <uvarint>  │ name <bytes>    │ │
│ ├──────────────────────┼─────────────────┤ │
│ │ len(value) <uvarint> │ value <bytes>   │ │
│ ├──────────────────────┴─────────────────┤ │
│ │  offset <uvarint64>                    │ │
│ └────────────────────────────────────────┘ │
│                    . . .                   │
├────────────────────────────────────────────┤
│  CRC32 <4b>                                │
└────────────────────────────────────────────┘

这里面的 offset 就是指向 posting 的对应的偏移量

TOC

┌─────────────────────────────────────────┐
│ ref(symbols) <8b>                       │
├─────────────────────────────────────────┤
│ ref(series) <8b>                        │
├─────────────────────────────────────────┤
│ ref(label indices start) <8b>           │
├─────────────────────────────────────────┤
│ ref(label offset table) <8b>            │
├─────────────────────────────────────────┤
│ ref(postings start) <8b>                │
├─────────────────────────────────────────┤
│ ref(postings offset table) <8b>         │
├─────────────────────────────────────────┤
│ CRC32 <4b>                              │
└─────────────────────────────────────────┘

type/mysql #type/golang #public

BruceChen7 / gitblog

prometheus磁盘文件格式理解 #77

参考资料

chunk

读 WAL 文件

recType

series 类型的 record

WAL disk 磁盘格式

index 文件格式

symbol table

series

格式

label index

posting

posting offset table

TOC

type/mysql #type/golang #public