stage0: improve documentation

This commit is contained in:
anna 2023-06-15 15:44:56 +02:00
parent 326997330b
commit ede142aed7
Signed by: fef
GPG key ID: 2585C2DC6D79B485

View file

@ -37,7 +37,7 @@
* In the case of bussy, we do ALL of the following, in that order: * In the case of bussy, we do ALL of the following, in that order:
* *
* 1. relocate ourselves to the end of low memory (0x7800:7c00 = 0x7fc00) * 1. relocate ourselves to the end of low memory (0x7800:7c00 = 0x7fc00)
* 2. do some rudimentary sanity checks * 2. do some rudimentary "sanity" checks
* 3. read the boot drive's GPT header and validate its magic and CRC * 3. read the boot drive's GPT header and validate its magic and CRC
* 4. read the boot drive's GPT and validate its CRC * 4. read the boot drive's GPT and validate its CRC
* 5. search the GPT for the partition containing stage1 * 5. search the GPT for the partition containing stage1
@ -47,14 +47,24 @@
* *
* What's worse, all of that has to happen in Real Mode x86 assembly. Ugh. * What's worse, all of that has to happen in Real Mode x86 assembly. Ugh.
* And account for all the stupid BIOS bugs since the 80s. Double ugh. * And account for all the stupid BIOS bugs since the 80s. Double ugh.
* These constraints force us to resort to some ... advanced space saving
* techniques, resulting in rather obfuscated code:
* *
* These constraints force us to resort to some advanced space saving techniques * - Every subroutine has a completely custom ABI that was carefully chosen to
* because literally every single byte counts. As such, we reuse register * minimize register saves in the contexts from which they are called.
* values and side effects, replace instructions that take up multiple bytes * - A lot of stuff relies heavily on side effects/leftover values from previous
* with smaller ones, combine multiple operations into one, reorder code to be * operations, including registers that stay untouched for a long time.
* as compact as humanly possible, and so much more. At the very least, any CPU * - The code is arranged in a very specific way in order to maximize the number
* manufactured in this millennium will easily crunch through all of this within * of jumps with 1-byte relative addressing.
* a matter of milliseconds at most, so performance is a non-issue. * - Some instructions are replaced with similar ones that have additional side
* effects (e.g. `xor` instead of `cmp` if we need 0 in that register anyway).
* - Other instructions are replaced with similar ones that take up less bytes.
*
* We also don't have room for any meaningful error messages, although we do
 * compensate for that with error *codes* that I will hopefully document at
* some point (see error_slide). At the very least, any CPU manufactured in
* this millennium will easily crunch through all of this within a matter of
* milliseconds at most, so performance is a total non-issue.
* *
* I tried to keep this code itself location-agnostic and only work with * I tried to keep this code itself location-agnostic and only work with
* addresses defined in the linker script, but there are several places where * addresses defined in the linker script, but there are several places where
@ -81,7 +91,8 @@
* OpenZFS is even nice enough to write a GPT to them when creating a pool. * OpenZFS is even nice enough to write a GPT to them when creating a pool.
* That table contains two entries; one for the ZFS stuff itself and one for * That table contains two entries; one for the ZFS stuff itself and one for
* 8 MiB of reserved space that ZFS appears to just leave alone and not touch * 8 MiB of reserved space that ZFS appears to just leave alone and not touch
* at all. 8 MiB are way more than we'll ever need, * at all. 8 MiB are probably way more than we'll ever need, especially since
* we only need to store the core image and ZFS driver there.
*/ */
.code16 .code16
@ -136,7 +147,7 @@ GLOBL _start
/* %di should really be 0x7c00 (we're setting it to 0x7800 here), but a /* %di should really be 0x7c00 (we're setting it to 0x7800 here), but a
* `mov` between two registers is 1 byte smaller than one with an imm16 */ * `mov` between two registers is 1 byte smaller than one with an imm16 */
mov %bx, %di mov %bx, %di
mov $3, %ch /* 0x300 words (0x7800 to 0x7e00) */ mov $3, %ch /* 0x300 _words_ (0x7800 to 0x7e00) */
1: mov (%di), %ax /* use %ds (0x0000) for reading */ 1: mov (%di), %ax /* use %ds (0x0000) for reading */
stosw /* use %es (0x7800) for writing */ stosw /* use %es (0x7800) for writing */
loop 1b loop 1b
@ -165,10 +176,10 @@ GLOBL _start
mov $msg_loader_info, %si mov $msg_loader_info, %si
call print call print
/* we kinda abuse %bp, see the "error slide" below */ /* we kinda abuse %bp, see error_slide below */
mov $'1', %bp mov $'1', %bp
/* check and save our boot drive number */ /* check our boot drive number */
test $0x80, %dl /* only accept drives 0x80-0xff */ test $0x80, %dl /* only accept drives 0x80-0xff */
jz err_bad_boot_drive jz err_bad_boot_drive
@ -205,12 +216,18 @@ GLOBL _start
/* /*
* Step 4: validate the GPT's magic number and CRCs * Step 4: validate the GPT's magic number and CRCs
*
* XXX The osdev wiki (somewhat vaguely) states that reserved bytes in
* the GPT header should _not_ be included when calculating the CRC.
* This is true for anything from offset 0x5c to the end of the
* sector, but the 4 bytes at 0x14 _are_ included. At least fdisk
 * does that; it didn't work when I omitted them.
*/ */
/* we'll work with %fs quite a lot over the next lines, so much in fact /* we'll work with %fs quite a lot over the next lines, so much in fact
* that copying it to %ds and saving the segment overrides is worth it */ * that copying it to %ds and saving the segment overrides is worth it */
push %fs push %fs
pop %ds pop %ds /* %ds = 0x0500 */
/* check if the GPT signature is correct (%di points to sector) */ /* check if the GPT signature is correct (%di points to sector) */
xor %si, %si xor %si, %si
@ -220,12 +237,12 @@ GLOBL _start
jne err_no_gpt jne err_no_gpt
dec %ax /* %eax = 0 */ dec %ax /* %eax = 0 */
xchg 0x08(%si), %eax /* load CRC and replace it with zeroes */ xchg 0x08(%si), %eax /* load expected CRC and replace it with zeroes */
push %ax /* save CRC[15:0] */ /* '1 v */ push %ax /* save expected CRC[15:0] */ /* '1 v */
xor %si, %si xor %si, %si
mov $0x5c, %cl mov $0x5c, %cl
call crc32 /* %ebx = ~CRC, CF = 1 */ call crc32 /* %ebx = ~CRC, CF = 1 */
pop %ax /* restore CRC[15:0] */ /* '1 ^ */ pop %ax /* restore expected CRC[15:0] */ /* '1 ^ */
adc %eax, %ebx /* check CRC and set %ebx = 0 if equal */ adc %eax, %ebx /* check CRC and set %ebx = 0 if equal */
jne err_bad_gpt_csum jne err_bad_gpt_csum
@ -459,16 +476,19 @@ END read_lba
/* /*
* Calculate the CCITT32 ANSI CRC (polynomial 0x04c11db7) of %cx bytes * Calculate the CCITT32 ANSI CRC (polynomial 0x04c11db7) of %cx bytes
* at %fs:%si and return THE COMPLEMENT of the result in %ebx. * at %fs:%si and return THE COMPLEMENT of the result in %ebx.
* Clobbers %al, %cx, and %si. Does not check for overflows (%si + %cx * Clobbers %al, %cx, and %si. Does NOT check for overflows (%si + %cx
* must be <= 0x10000). If %cx is 0, the size is 64 KiB (0x10000), in * must be <= 0x10000). If %cx is 0, the size is 64 KiB (0x10000), in
* which case %si should also be 0 because of the missing wrap check. * which case %si should also be 0 because of the missing wrap check.
* This also sets CF = 1 because that allows for comparing the CRC with
* the expected value through a single `adc` instruction.
* *
* Stolen from "Hacker's Delight", second edition by Henry S. Warren, Jr. * This also sets CF = 1, which we use for checking the CRC with a
* and painstakingly ported to x86 assembly with focus on minimum size. * single `adc` instruction (x + ~x + 1 = 0 because that's how 2's
* I have zero clue how CRC *actually* works, so there may be room for * complement works). `stc` is only 1 byte whereas `not r32` takes up
* optimizations. * 3 in 16-bit mode, and `adc` is the same size as `cmp` and `xor`.
*
* The algorithm is stolen from "Hacker's Delight", second edition by
* Henry S. Warren, Jr. and painstakingly ported to x86 assembly with
* focus on minimizing the binary size. NB: I have zero clue how CRC
* *actually* works, so there may be room for further optimizations.
* *
* %al: clobber * %al: clobber
* %ebx: return value * %ebx: return value