stage0: improve documentation
This commit is contained in:
parent
326997330b
commit
ede142aed7
1 changed files with 43 additions and 23 deletions
|
@ -37,7 +37,7 @@
|
||||||
* In the case of bussy, we do ALL of the following, in that order:
|
* In the case of bussy, we do ALL of the following, in that order:
|
||||||
*
|
*
|
||||||
* 1. relocate ourselves to the end of low memory (0x7800:7c00 = 0x7fc00)
|
* 1. relocate ourselves to the end of low memory (0x7800:7c00 = 0x7fc00)
|
||||||
* 2. do some rudimentary sanity checks
|
* 2. do some rudimentary "sanity" checks
|
||||||
* 3. read the boot drive's GPT header and validate its magic and CRC
|
* 3. read the boot drive's GPT header and validate its magic and CRC
|
||||||
* 4. read the boot drive's GPT and validate its CRC
|
* 4. read the boot drive's GPT and validate its CRC
|
||||||
* 5. search the GPT for the partition containing stage1
|
* 5. search the GPT for the partition containing stage1
|
||||||
|
@ -47,14 +47,24 @@
|
||||||
*
|
*
|
||||||
* What's worse, all of that has to happen in Real Mode x86 assembly. Ugh.
|
* What's worse, all of that has to happen in Real Mode x86 assembly. Ugh.
|
||||||
* And account for all the stupid BIOS bugs since the 80s. Double ugh.
|
* And account for all the stupid BIOS bugs since the 80s. Double ugh.
|
||||||
|
* These constraints force us to resort to some ... advanced space saving
|
||||||
|
* techniques, resulting in rather obfuscated code:
|
||||||
*
|
*
|
||||||
* These constraints force us to resort to some advanced space saving techniques
|
* - Every subroutine has a completely custom ABI that was carefully chosen to
|
||||||
* because literally every single byte counts. As such, we reuse register
|
* minimize register saves in the contexts from which they are called.
|
||||||
* values and side effects, replace instructions that take up multiple bytes
|
* - A lot of stuff relies heavily on side effects/leftover values from previous
|
||||||
* with smaller ones, combine multiple operations into one, reorder code to be
|
* operations, including registers that stay untouched for a long time.
|
||||||
* as compact as humanly possible, and so much more. At the very least, any CPU
|
* - The code is arranged in a very specific way in order to maximize the number
|
||||||
* manufactured in this millennium will easily crunch through all of this within
|
* of jumps with 1-byte relative addressing.
|
||||||
* a matter of milliseconds at most, so performance is a non-issue.
|
* - Some instructions are replaced with similar ones that have additional side
|
||||||
|
* effects (e.g. `xor` instead of `cmp` if we need 0 in that register anyway).
|
||||||
|
* - Other instructions are replaced with similar ones that take up fewer bytes.
|
||||||
|
*
|
||||||
|
* We also don't have room for any meaningful error messages, although we do
|
||||||
|
* compensate for that with error *codes* that I will hopefully document at
|
||||||
|
* some point (see error_slide). At the very least, any CPU manufactured in
|
||||||
|
* this millennium will easily crunch through all of this within a matter of
|
||||||
|
* milliseconds at most, so performance is a total non-issue.
|
||||||
*
|
*
|
||||||
* I tried to keep this code itself location-agnostic and only work with
|
* I tried to keep this code itself location-agnostic and only work with
|
||||||
* addresses defined in the linker script, but there are several places where
|
* addresses defined in the linker script, but there are several places where
|
||||||
|
@ -81,7 +91,8 @@
|
||||||
* OpenZFS is even nice enough to write a GPT to them when creating a pool.
|
* OpenZFS is even nice enough to write a GPT to them when creating a pool.
|
||||||
* That table contains two entries; one for the ZFS stuff itself and one for
|
* That table contains two entries; one for the ZFS stuff itself and one for
|
||||||
* 8 MiB of reserved space that ZFS appears to just leave alone and not touch
|
* 8 MiB of reserved space that ZFS appears to just leave alone and not touch
|
||||||
* at all. 8 MiB are way more than we'll ever need,
|
* at all. 8 MiB are probably way more than we'll ever need, especially since
|
||||||
|
* we only need to store the core image and ZFS driver there.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
.code16
|
.code16
|
||||||
|
@ -136,7 +147,7 @@ GLOBL _start
|
||||||
/* %di should really be 0x7c00 (we're setting it to 0x7800 here), but a
|
/* %di should really be 0x7c00 (we're setting it to 0x7800 here), but a
|
||||||
* `mov` between two registers is 1 byte smaller than one with an imm16 */
|
* `mov` between two registers is 1 byte smaller than one with an imm16 */
|
||||||
mov %bx, %di
|
mov %bx, %di
|
||||||
mov $3, %ch /* 0x300 words (0x7800 to 0x7e00) */
|
mov $3, %ch /* 0x300 _words_ (0x7800 to 0x7e00) */
|
||||||
1: mov (%di), %ax /* use %ds (0x0000) for reading */
|
1: mov (%di), %ax /* use %ds (0x0000) for reading */
|
||||||
stosw /* use %es (0x7800) for writing */
|
stosw /* use %es (0x7800) for writing */
|
||||||
loop 1b
|
loop 1b
|
||||||
|
@ -165,10 +176,10 @@ GLOBL _start
|
||||||
mov $msg_loader_info, %si
|
mov $msg_loader_info, %si
|
||||||
call print
|
call print
|
||||||
|
|
||||||
/* we kinda abuse %bp, see the "error slide" below */
|
/* we kinda abuse %bp, see error_slide below */
|
||||||
mov $'1', %bp
|
mov $'1', %bp
|
||||||
|
|
||||||
/* check and save our boot drive number */
|
/* check our boot drive number */
|
||||||
test $0x80, %dl /* only accept drives 0x80-0xff */
|
test $0x80, %dl /* only accept drives 0x80-0xff */
|
||||||
jz err_bad_boot_drive
|
jz err_bad_boot_drive
|
||||||
|
|
||||||
|
@ -205,12 +216,18 @@ GLOBL _start
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Step 4: validate the GPT's magic number and CRCs
|
* Step 4: validate the GPT's magic number and CRCs
|
||||||
|
*
|
||||||
|
* XXX The osdev wiki (somewhat vaguely) states that reserved bytes in
|
||||||
|
* the GPT header should _not_ be included when calculating the CRC.
|
||||||
|
* This is true for anything from offset 0x5c to the end of the
|
||||||
|
* sector, but the 4 bytes at 0x14 _are_ included. At least fdisk
|
||||||
|
* does that; it didn't work when I omitted them.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/* we'll work with %fs quite a lot over the next lines, so much in fact
|
/* we'll work with %fs quite a lot over the next lines, so much in fact
|
||||||
* that copying it to %ds and saving the segment overrides is worth it */
|
* that copying it to %ds and saving the segment overrides is worth it */
|
||||||
push %fs
|
push %fs
|
||||||
pop %ds
|
pop %ds /* %ds = 0x0500 */
|
||||||
|
|
||||||
/* check if the GPT signature is correct (%di points to sector) */
|
/* check if the GPT signature is correct (%di points to sector) */
|
||||||
xor %si, %si
|
xor %si, %si
|
||||||
|
@ -220,12 +237,12 @@ GLOBL _start
|
||||||
jne err_no_gpt
|
jne err_no_gpt
|
||||||
|
|
||||||
dec %ax /* %eax = 0 */
|
dec %ax /* %eax = 0 */
|
||||||
xchg 0x08(%si), %eax /* load CRC and replace it with zeroes */
|
xchg 0x08(%si), %eax /* load expected CRC and replace it with zeroes */
|
||||||
push %ax /* save CRC[15:0] */ /* '1 v */
|
push %ax /* save expected CRC[15:0] */ /* '1 v */
|
||||||
xor %si, %si
|
xor %si, %si
|
||||||
mov $0x5c, %cl
|
mov $0x5c, %cl
|
||||||
call crc32 /* %ebx = ~CRC, CF = 1 */
|
call crc32 /* %ebx = ~CRC, CF = 1 */
|
||||||
pop %ax /* restore CRC[15:0] */ /* '1 ^ */
|
pop %ax /* restore expected CRC[15:0] */ /* '1 ^ */
|
||||||
adc %eax, %ebx /* check CRC and set %ebx = 0 if equal */
|
adc %eax, %ebx /* check CRC and set %ebx = 0 if equal */
|
||||||
jne err_bad_gpt_csum
|
jne err_bad_gpt_csum
|
||||||
|
|
||||||
|
@ -459,16 +476,19 @@ END read_lba
|
||||||
/*
|
/*
|
||||||
* Calculate the CCITT32 ANSI CRC (polynomial 0x04c11db7) of %cx bytes
|
* Calculate the CCITT32 ANSI CRC (polynomial 0x04c11db7) of %cx bytes
|
||||||
* at %fs:%si and return THE COMPLEMENT of the result in %ebx.
|
* at %fs:%si and return THE COMPLEMENT of the result in %ebx.
|
||||||
* Clobbers %al, %cx, and %si. Does not check for overflows (%si + %cx
|
* Clobbers %al, %cx, and %si. Does NOT check for overflows (%si + %cx
|
||||||
* must be <= 0x10000). If %cx is 0, the size is 64 KiB (0x10000), in
|
* must be <= 0x10000). If %cx is 0, the size is 64 KiB (0x10000), in
|
||||||
* which case %si should also be 0 because of the missing wrap check.
|
* which case %si should also be 0 because of the missing wrap check.
|
||||||
* This also sets CF = 1 because that allows for comparing the CRC with
|
|
||||||
* the expected value through a single `adc` instruction.
|
|
||||||
*
|
*
|
||||||
* Stolen from "Hacker's Delight", second edition by Henry S. Warren, Jr.
|
* This also sets CF = 1, which we use for checking the CRC with a
|
||||||
* and painstakingly ported to x86 assembly with focus on minimum size.
|
* single `adc` instruction (x + ~x + 1 = 0 because that's how 2's
|
||||||
* I have zero clue how CRC *actually* works, so there may be room for
|
* complement works). `stc` is only 1 byte whereas `not r32` takes up
|
||||||
* optimizations.
|
* 3 in 16-bit mode, and `adc` is the same size as `cmp` and `xor`.
|
||||||
|
*
|
||||||
|
* The algorithm is stolen from "Hacker's Delight", second edition by
|
||||||
|
* Henry S. Warren, Jr. and painstakingly ported to x86 assembly with
|
||||||
|
* focus on minimizing the binary size. NB: I have zero clue how CRC
|
||||||
|
* *actually* works, so there may be room for further optimizations.
|
||||||
*
|
*
|
||||||
* %al: clobber
|
* %al: clobber
|
||||||
* %ebx: return value
|
* %ebx: return value
|
||||||
|
|
Loading…
Reference in a new issue