diff --git a/src/boot/stage0.s b/src/boot/stage0.s index 08a95c5..a510556 100644 --- a/src/boot/stage0.s +++ b/src/boot/stage0.s @@ -37,7 +37,7 @@ * In the case of bussy, we do ALL of the following, in that order: * * 1. relocate ourselves to the end of low memory (0x7800:7c00 = 0x7fc00) - * 2. do some rudimentary sanity checks + * 2. do some rudimentary "sanity" checks * 3. read the boot drive's GPT header and validate its magic and CRC * 4. read the boot drive's GPT and validate its CRC * 5. search the GPT for the partition containing stage1 @@ -47,14 +47,24 @@ * * What's worse, all of that has to happen in Real Mode x86 assembly. Ugh. * And account for all the stupid BIOS bugs since the 80s. Double ugh. + * These constraints force us to resort to some ... advanced space saving + * techniques, resulting in rather obfuscated code: * - * These constraints force us to resort to some advanced space saving techniques - * because literally every single byte counts. As such, we reuse register - * values and side effects, replace instructions that take up multiple bytes - * with smaller ones, combine multiple operations into one, reorder code to be - * as compact as humanly possible, and so much more. At the very least, any CPU - * manufactured in this millennium will easily crunch through all of this within - * a matter of milliseconds at most, so performance is a non-issue. + * - Every subroutine has a completely custom ABI that was carefully chosen to + * minimize register saves in the contexts from which they are called. + * - A lot of stuff relies heavily on side effects/leftover values from previous + * operations, including registers that stay untouched for a long time. + * - The code is arranged in a very specific way in order to maximize the number + * of jumps with 1-byte relative addressing. + * - Some instructions are replaced with similar ones that have additional side + * effects (e.g. `xor` instead of `cmp` if we need 0 in that register anyway). 
+ * - Other instructions are replaced with similar ones that take up fewer bytes. + * + * We also don't have room for any meaningful error messages, although we do + * compensate for that with error *codes* that I will hopefully document at + * some point (see error_slide). At the very least, any CPU manufactured in + * this millennium will easily crunch through all of this within a matter of + * milliseconds at most, so performance is a total non-issue. * * I tried to keep this code itself location-agnostic and only work with * addresses defined in the linker script, but there are several places where @@ -81,7 +91,8 @@ * OpenZFS is even nice enough to write a GPT to them when creating a pool. * That table contains two entries; one for the ZFS stuff itself and one for * 8 MiB of reserved space that ZFS appears to just leave alone and not touch - * at all. 8 MiB are way more than we'll ever need, + * at all. 8 MiB are probably way more than we'll ever need, especially since + * we only need to store the core image and ZFS driver there. 
*/ .code16 @@ -136,7 +147,7 @@ GLOBL _start /* %di should really be 0x7c00 (we're setting it to 0x7800 here), but a * `mov` between two registers is 1 byte smaller than one with an imm16 */ mov %bx, %di - mov $3, %ch /* 0x300 words (0x7800 to 0x7e00) */ + mov $3, %ch /* 0x300 _words_ (0x7800 to 0x7e00) */ 1: mov (%di), %ax /* use %ds (0x0000) for reading */ stosw /* use %es (0x7800) for writing */ loop 1b @@ -165,10 +176,10 @@ GLOBL _start mov $msg_loader_info, %si call print - /* we kinda abuse %bp, see the "error slide" below */ + /* we kinda abuse %bp, see error_slide below */ mov $'1', %bp - /* check and save our boot drive number */ + /* check our boot drive number */ test $0x80, %dl /* only accept drives 0x80-0xff */ jz err_bad_boot_drive @@ -205,12 +216,18 @@ GLOBL _start /* * Step 4: validate the GPT's magic number and CRCs + * + * XXX The osdev wiki (somewhat vaguely) states that reserved bytes in + * the GPT header should _not_ be included when calculating the CRC. + * This is true for anything from offset 0x5c to the end of the + * sector, but the 4 bytes at 0x14 _are_ included. At least fdisk + * does that; it didn't work when I omitted them. 
*/ /* we'll work with %fs quite a lot over the next lines, so much in fact * that copying it to %ds and saving the segment overrides is worth it */ push %fs - pop %ds + pop %ds /* %ds = 0x0500 */ /* check if the GPT signature is correct (%di points to sector) */ xor %si, %si @@ -220,12 +237,12 @@ GLOBL _start jne err_no_gpt dec %ax /* %eax = 0 */ - xchg 0x08(%si), %eax /* load CRC and replace it with zeroes */ - push %ax /* save CRC[15:0] */ /* '1 v */ + xchg 0x08(%si), %eax /* load expected CRC and replace it with zeroes */ + push %ax /* save expected CRC[15:0] */ /* '1 v */ xor %si, %si mov $0x5c, %cl call crc32 /* %ebx = ~CRC, CF = 1 */ - pop %ax /* restore CRC[15:0] */ /* '1 ^ */ + pop %ax /* restore expected CRC[15:0] */ /* '1 ^ */ adc %eax, %ebx /* check CRC and set %ebx = 0 if equal */ jne err_bad_gpt_csum @@ -459,16 +476,19 @@ END read_lba /* * Calculate the CCITT32 ANSI CRC (polynomial 0x04c11db7) of %cx bytes * at %fs:%si and return THE COMPLEMENT of the result in %ebx. - * Clobbers %al, %cx, and %si. Does not check for overflows (%si + %cx + * Clobbers %al, %cx, and %si. Does NOT check for overflows (%si + %cx * must be <= 0x10000). If %cx is 0, the size is 64 KiB (0x10000), in * which case %si should also be 0 because of the missing wrap check. - * This also sets CF = 1 because that allows for comparing the CRC with - * the expected value through a single `adc` instruction. * - * Stolen from "Hacker's Delight", second edition by Henry S. Warren, Jr. - * and painstakingly ported to x86 assembly with focus on minimum size. - * I have zero clue how CRC *actually* works, so there may be room for - * optimizations. + * This also sets CF = 1, which we use for checking the CRC with a + * single `adc` instruction (x + ~x + 1 = 0 because that's how 2's + * complement works). `stc` is only 1 byte whereas `not r32` takes up + * 3 in 16-bit mode, and `adc` is the same size as `cmp` and `xor`. 
+ * + * The algorithm is stolen from "Hacker's Delight", second edition by + * Henry S. Warren, Jr. and painstakingly ported to x86 assembly with + * focus on minimizing the binary size. NB: I have zero clue how CRC + * *actually* works, so there may be room for further optimizations. * * %al: clobber * %ebx: return value