stage0: improve documentation
This commit is contained in:
parent
326997330b
commit
ede142aed7
1 changed files with 43 additions and 23 deletions
|
@ -37,7 +37,7 @@
|
|||
* In the case of bussy, we do ALL of the following, in that order:
|
||||
*
|
||||
* 1. relocate ourselves to the end of low memory (0x7800:7c00 = 0x7fc00)
|
||||
* 2. do some rudimentary sanity checks
|
||||
* 2. do some rudimentary "sanity" checks
|
||||
* 3. read the boot drive's GPT header and validate its magic and CRC
|
||||
* 4. read the boot drive's GPT and validate its CRC
|
||||
* 5. search the GPT for the partition containing stage1
|
||||
|
@ -47,14 +47,24 @@
|
|||
*
|
||||
* What's worse, all of that has to happen in Real Mode x86 assembly. Ugh.
|
||||
* And account for all the stupid BIOS bugs since the 80s. Double ugh.
|
||||
* These constraints force us to resort to some ... advanced space saving
|
||||
* techniques, resulting in rather obfuscated code:
|
||||
*
|
||||
* These constraints force us to resort to some advanced space saving techniques
|
||||
* because literally every single byte counts. As such, we reuse register
|
||||
* values and side effects, replace instructions that take up multiple bytes
|
||||
* with smaller ones, combine multiple operations into one, reorder code to be
|
||||
* as compact as humanly possible, and so much more. At the very least, any CPU
|
||||
* manufactured in this millennium will easily crunch through all of this within
|
||||
* a matter of milliseconds at most, so performance is a non-issue.
|
||||
* - Every subroutine has a completely custom ABI that was carefully chosen to
|
||||
* minimize register saves in the contexts from which they are called.
|
||||
* - A lot of stuff relies heavily on side effects/leftover values from previous
|
||||
* operations, including registers that stay untouched for a long time.
|
||||
* - The code is arranged in a very specific way in order to maximize the number
|
||||
* of jumps with 1-byte relative addressing.
|
||||
* - Some instructions are replaced with similar ones that have additional side
|
||||
* effects (e.g. `xor` instead of `cmp` if we need 0 in that register anyway).
|
||||
* - Other instructions are replaced with similar ones that take up fewer bytes.
|
||||
*
|
||||
* We also don't have room for any meaningful error messages, although we do
|
||||
* compensate for that with error *codes* that I will hopefully document at
|
||||
* some point (see error_slide). At the very least, any CPU manufactured in
|
||||
* this millennium will easily crunch through all of this within a matter of
|
||||
* milliseconds at most, so performance is a total non-issue.
|
||||
*
|
||||
* I tried to keep this code itself location-agnostic and only work with
|
||||
* addresses defined in the linker script, but there are several places where
|
||||
|
@ -81,7 +91,8 @@
|
|||
* OpenZFS is even nice enough to write a GPT to them when creating a pool.
|
||||
* That table contains two entries; one for the ZFS stuff itself and one for
|
||||
* 8 MiB of reserved space that ZFS appears to just leave alone and not touch
|
||||
* at all. 8 MiB are way more than we'll ever need,
|
||||
* at all. 8 MiB is probably way more than we'll ever need, especially since
|
||||
* we only need to store the core image and ZFS driver there.
|
||||
*/
|
||||
|
||||
.code16
|
||||
|
@ -136,7 +147,7 @@ GLOBL _start
|
|||
/* %di should really be 0x7c00 (we're setting it to 0x7800 here), but a
|
||||
* `mov` between two registers is 1 byte smaller than one with an imm16 */
|
||||
mov %bx, %di
|
||||
mov $3, %ch /* 0x300 words (0x7800 to 0x7e00) */
|
||||
mov $3, %ch /* 0x300 _words_ (0x7800 to 0x7e00) */
|
||||
1: mov (%di), %ax /* use %ds (0x0000) for reading */
|
||||
stosw /* use %es (0x7800) for writing */
|
||||
loop 1b
|
||||
|
@ -165,10 +176,10 @@ GLOBL _start
|
|||
mov $msg_loader_info, %si
|
||||
call print
|
||||
|
||||
/* we kinda abuse %bp, see the "error slide" below */
|
||||
/* we kinda abuse %bp, see error_slide below */
|
||||
mov $'1', %bp
|
||||
|
||||
/* check and save our boot drive number */
|
||||
/* check our boot drive number */
|
||||
test $0x80, %dl /* only accept drives 0x80-0xff */
|
||||
jz err_bad_boot_drive
|
||||
|
||||
|
@ -205,12 +216,18 @@ GLOBL _start
|
|||
|
||||
/*
|
||||
* Step 4: validate the GPT's magic number and CRCs
|
||||
*
|
||||
* XXX The osdev wiki (somewhat vaguely) states that reserved bytes in
|
||||
* the GPT header should _not_ be included when calculating the CRC.
|
||||
* This is true for anything from offset 0x5c to the end of the
|
||||
* sector, but the 4 bytes at 0x14 _are_ included. At least fdisk
|
||||
* does that; it didn't work when I omitted them.
|
||||
*/
|
||||
|
||||
/* we'll work with %fs quite a lot over the next lines, so much in fact
|
||||
* that copying it to %ds and saving the segment overrides is worth it */
|
||||
push %fs
|
||||
pop %ds
|
||||
pop %ds /* %ds = 0x0500 */
|
||||
|
||||
/* check if the GPT signature is correct (%di points to sector) */
|
||||
xor %si, %si
|
||||
|
@ -220,12 +237,12 @@ GLOBL _start
|
|||
jne err_no_gpt
|
||||
|
||||
dec %ax /* %eax = 0 */
|
||||
xchg 0x08(%si), %eax /* load CRC and replace it with zeroes */
|
||||
push %ax /* save CRC[15:0] */ /* '1 v */
|
||||
xchg 0x08(%si), %eax /* load expected CRC and replace it with zeroes */
|
||||
push %ax /* save expected CRC[15:0] */ /* '1 v */
|
||||
xor %si, %si
|
||||
mov $0x5c, %cl
|
||||
call crc32 /* %ebx = ~CRC, CF = 1 */
|
||||
pop %ax /* restore CRC[15:0] */ /* '1 ^ */
|
||||
pop %ax /* restore expected CRC[15:0] */ /* '1 ^ */
|
||||
adc %eax, %ebx /* check CRC and set %ebx = 0 if equal */
|
||||
jne err_bad_gpt_csum
|
||||
|
||||
|
@ -459,16 +476,19 @@ END read_lba
|
|||
/*
|
||||
* Calculate the CCITT32 ANSI CRC (polynomial 0x04c11db7) of %cx bytes
|
||||
* at %fs:%si and return THE COMPLEMENT of the result in %ebx.
|
||||
* Clobbers %al, %cx, and %si. Does not check for overflows (%si + %cx
|
||||
* Clobbers %al, %cx, and %si. Does NOT check for overflows (%si + %cx
|
||||
* must be <= 0x10000). If %cx is 0, the size is 64 KiB (0x10000), in
|
||||
* which case %si should also be 0 because of the missing wrap check.
|
||||
* This also sets CF = 1 because that allows for comparing the CRC with
|
||||
* the expected value through a single `adc` instruction.
|
||||
*
|
||||
* Stolen from "Hacker's Delight", second edition by Henry S. Warren, Jr.
|
||||
* and painstakingly ported to x86 assembly with focus on minimum size.
|
||||
* I have zero clue how CRC *actually* works, so there may be room for
|
||||
* optimizations.
|
||||
* This also sets CF = 1, which we use for checking the CRC with a
|
||||
* single `adc` instruction (x + ~x + 1 = 0 because that's how 2's
|
||||
* complement works). `stc` is only 1 byte whereas `not r32` takes up
|
||||
* 3 in 16-bit mode, and `adc` is the same size as `cmp` and `xor`.
|
||||
*
|
||||
* The algorithm is stolen from "Hacker's Delight", second edition by
|
||||
* Henry S. Warren, Jr. and painstakingly ported to x86 assembly with
|
||||
* focus on minimizing the binary size. NB: I have zero clue how CRC
|
||||
* *actually* works, so there may be room for further optimizations.
|
||||
*
|
||||
* %al: clobber
|
||||
* %ebx: return value
|
||||
|
|
Loading…
Reference in a new issue