You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

674 lines
21 KiB
ArmAsm

/*
* fef
*
* S T A G E Z E R O
*
* a dystopian novel
*/
/*
* PREAMBLE
*
* This place is a message ...
* and part of a system of messages ...
* pay attention to it!
*
* Sending this message was important to us.
* We considered ourselves to be a powerful culture.
*
* This place is not a place of honor ...
* no highly esteemed deed is commemorated here ...
* nothing valued is here.
*
* What is here was dangerous and repulsive to us.
* This message is a warning about danger.
*
* The danger is still present, in your time, as it was in ours.
*
* This place is best shunned and left uninhabited.
*/
/*
* PROLOGUE
*
* Shitposting aside (and ignoring the fact that this entire file is a shitpost
* to begin with), bootsectors are kind of awkward because we have to squeeze
* quite a lot of stuff into a lousy 440 bytes.
* In the case of bussy, we do ALL of the following, in that order:
*
* 1. relocate ourselves to the end of low memory (0x7800:7c00 = 0x7fc00)
* 2. do some rudimentary "sanity" checks
* 3. read the boot drive's GPT header and validate its magic and CRC
* 4. read the boot drive's GPT and validate its CRC
* 5. search the GPT for the partition containing stage1
* 6. copy stage1 to the beginning of low memory (0x0000:0500 = 0x00500)
* 7. validate stage1's magic number and CRC
* 8. jump to the stage 1 entry point
*
* What's worse, all of that has to happen in Real Mode x86 assembly. Ugh.
* And account for all the stupid BIOS bugs since the 80s. Double ugh.
* These constraints force us to resort to some ... advanced space saving
* techniques, resulting in rather obfuscated code:
*
* - Every subroutine has a completely custom ABI that was carefully chosen to
* minimize register saves in the contexts from which they are called.
* - A lot of stuff relies heavily on side effects/leftover values from previous
* operations, including registers that stay untouched for a long time.
* - The code is arranged in a very specific way in order to maximize the number
* of jumps with 1-byte relative addressing.
* - Some instructions are replaced with similar ones that have additional side
* effects (e.g. `xor` instead of `cmp` if we need 0 in that register anyway).
* - Other instructions are replaced with similar ones that take up fewer bytes.
*
* We also don't have room for any meaningful error messages, although we do
* compensate for that with error *codes* that i will hopefully document at
* some point (see error_slide). At the very least, any CPU manufactured in
* this millennium will easily crunch through all of this within a matter of
* milliseconds at most, so performance is a total non-issue.
*
* I tried to keep this code itself location-agnostic and only work with
* addresses defined in the linker script, but there are several places where
* that's just not possible. And since this is in no way useful other than the
* highly specific IBM PC environment, i figured it doesn't really matter if i
* hardcode some stuff. Sorry :)
*
* Keeping track of the stack can become rather difficult, because some values
* stay there for quite a while and there is NO %bp (see "error_slide").
* Therefore, i annotated all stack transactions with "lifetimes" in a separate
* comment towards the far end of the line. The number indicates the current
* stack height AFTER a push and BEFORE a pop. The `v` and `^` are "arrows"
* that denote whether it is a push or pop respectively:
* (1) they indicate the direction in which you have to scroll to get to the
* corresponding counterpart to the instruction
* (2) if you imagine memory as some sort of pillar where addresses grow in the
* upward direction, the "arrows" point in the direction that %sp moves
* (3) if you imagine a physical stack of things, the "arrows" match the
* direction of putting things on the stack (down) or lifting them off (up)
*
* Note that this MBR is also used for booting from ZFS drives (when ZFS is
* given control of the entire drive, rather than just a partition) because the
* ZFS folks had enough foresight to reserve the first couple of sectors, and
* OpenZFS is even nice enough to write a GPT to them when creating a pool.
* That table contains two entries; one for the ZFS stuff itself and one for
* 8 MiB of reserved space that ZFS appears to just leave alone and not touch
* at all. 8 MiB are probably way more than we'll ever need, especially since
* we only need to store the core image and ZFS driver there.
*/
.code16
/*
* LOCAL name[, type]
* Open the definition of a file-local symbol: emit a `.type` directive for
* `name` (the type defaults to `function`) and place the label right here.
* Every LOCAL/GLOBL is expected to be closed with a matching END.
*/
.macro LOCAL name, type=function
.type \name , %\type
\name :
.endm
/*
* GLOBL name[, type]
* Same as LOCAL, but additionally exports the symbol with `.global`.
* `type` has no default here; it is forwarded to LOCAL unchanged.
* NOTE(review): `GLOBL _start` is invoked without a type, so this presumably
* relies on LOCAL's `function` default applying to the empty argument.
*/
.macro GLOBL name, type
.global \name
LOCAL \name , \type
.endm
/*
* END name
* Close a definition opened with LOCAL/GLOBL by setting the symbol size to
* the distance between its label and the current location.
*/
.macro END name
.size \name , . - \name
.endm
/*
* CHAPTER 1
*
* "Bootstrap Routine"
*
* 0x7c00
*/
.text
GLOBL _start
/*
* Entry point of the boot sector. The boot drive number is expected in %dl
* (it is validated below and handed to every int13 call).
* This routine relocates the image, clears .bss, sets up the stack, checks
* for int13 extensions, reads and validates the GPT header and computes the
* size of the partition table. It then continues in start2, or falls
* through into error_slide if the table size overflows 16 bits.
*/
cli
cld
/*
* Step 1: Relocate to 0x7fc00
*
* Since we have to use segments to access anything above 0xffff anyway,
* this binary is statically linked against the physical load address of
* 0x7c00. That's not too bad though, because we have to initialize all
* segment registers one way or the other (can't rely on the BIOS doing
* that for us, especially because some of them appear to have this fun
* quirk where they jump to 0x07c0:0000 instead of 0x0000:7c00).
*
* This assumes that the entire memory used by stage0 (including .bss
* and stack) is <= 0x400 bytes in size, which is a pretty conservative
* guess. In practice, it's probably more like 0x300 at most.
*/
xor %cx, %cx
mov %cx, %ds
mov $0x7800, %bx /* (0x7fc00 - 0x7c00) >> 4 */
mov %bx, %es
/* %di should really be 0x7c00 (we're setting it to 0x7800 here), but a
* `mov` between two registers is 1 byte smaller than one with an imm16.
* The price is that we copy 0x400 bytes of junk in front of the image. */
mov %bx, %di
mov $3, %ch /* 0x300 _words_ (0x7800 to 0x7e00) */
1: mov (%di), %ax /* use %ds (0x0000) for reading */
stosw /* use %es (0x7800) for writing */
loop 1b
/* this loop ends with 0xaa55 (MBR magic) in %ax, which we reuse later */
mov %bx, %ds
mov %bx, %ss
/* XXX gdb gets a stroke when we relocate ourselves */
ljmp $0x7800, $2f
/*
* Step 2: Rudimentary sanity checks
*/
/* Clear .bss and initialize %sp to the end of low memory. The previous
* loop ended with %di = 0x7e00, %cx = 0x0000, and %bx = 0x7800.
* `xchg %ax, r16` takes up only 1 byte as opposed to `xor r16, r16`
* which takes 2. It also moves the 0xaa55 to %bx, where we need it.
* After the exchange %ax = 0x7800, so %al = 0x00 is the fill byte that
* `rep stosb` writes. */
2: xchg %ax, %bx
mov $2, %ch /* 0x200 bytes (0x7e00 to 0x8000) */
rep stosb
mov %di, %sp /* %sp = 0x7800:8000 = 0x80000 */
sti /* we're safe again, i think */
/* say hello */
mov $msg_loader_info, %si
call print
/* we kinda abuse %bp, see error_slide below */
mov $'1', %bp
/* check our boot drive number */
test $0x80, %dl /* only accept drives 0x80-0xff */
jz err_bad_boot_drive
/* check whether the BIOS supports IBM int13 extensions */
mov $0x41, %ah
not %bx /* %bx = 0x55aa */
int $0x13
/* `mov` leaves FLAGS alone, so CF from the BIOS call survives until `jc` */
mov $0, %dh /* now is the perfect opportunity to clear %dh */
jc err_no_int13_extensions
xor $0xaa55, %bx /* clear %bx on success */
jnz err_bad_boot_drive
/*
* Step 3: Read and validate the boot drive's GPT header
*/
/* get the boot drive's logical sector size */
mov $0x48, %ah
mov $drive_params, %si
movb $(drive_params_end - drive_params), (%si)
int $0x13 /* int13/48 clears %ah and CF on success */
mov $0, %al
adc %ax, %ax /* %ax = 2 * %ax + CF; 0 when %ah and CF are both clear */
jnz err_bad_boot_drive
/* read LBA 1 (where the GPT header is supposed to be) */
mov $0x50, %al /* we just checked that %ah is 0 */
mov %ax, %fs /* %fs = 0x0050 (physical 0x00500) */
xor %eax, %eax
xor %di, %di
inc %ax /* %eax = 1 */
mov %ax, %cx /* %cx = 1 */
/* %bx (LBA[47:32]) is still 0 from the `xor $0xaa55, %bx` above */
call read_lba /* LBA = 0x00000001, dest = 0x0050:0000 */
/*
* Step 4: validate the GPT's magic number and CRCs
*
* XXX The osdev wiki (somewhat vaguely) states that reserved bytes in
* the GPT header should _not_ be included when calculating the CRC.
* This is true for anything from offset 0x5c to the end of the
* sector, but the 4 bytes at 0x14 _are_ included. At least fdisk
* does that; it didn't work when i omitted them.
*/
/* we'll work with %fs quite a lot over the next lines, so much in fact
* that copying it to %ds and saving the segment overrides is worth it */
push %fs
pop %ds /* %ds = 0x0050 (physical 0x00500) */
/* check if the GPT signature is correct (%si points to the sector in %ds,
* %di points to the expected magic in %es) */
xor %si, %si
mov $gpt_magic, %di
mov $(gpt_magic_end - gpt_magic), %cl /* %cx was 1 */
repe cmpsb %es:(%di), %ds:(%si)
jne err_no_gpt
/* a full match leaves %si just past the magic and %cx = 0 */
dec %ax /* %eax = 0 */
xchg 0x08(%si), %eax /* load expected CRC and replace it with zeroes */
push %ax /* save expected CRC[15:0] */ /* '1 v */
xor %si, %si
mov $0x5c, %cl /* %cx = 0x5c: checksummed part of the header */
call crc32 /* %ebx = ~CRC, CF = 1 */
pop %ax /* restore expected CRC[15:0] */ /* '1 ^ */
adc %eax, %ebx /* check CRC and set %ebx = 0 if equal */
jne err_bad_gpt_csum
/* store everything we need to remember from the GPT header (we assert there
* to be no more than 65535 partitions and that each entry is no more than
* 65535 bytes, because if that's not the case we're screwed anyway) */
mov $0x0048, %si
/* `lods` advances %si to 0x4c, so the displacements below are relative
* to header offset 0x4c (0x4c, 0x50, 0x58 and 0x54 respectively) */
lods (%si), %eax /* first LBA of partition table */
mov (%si), %bx /* first LBA of partition table, high bits */
mov 0x04(%si), %cx /* number of partition entries */
mov 0x0c(%si), %edi /* CRC32 of the partition entry array */
mov 0x08(%si), %si /* size of each partition entry in bytes */
/* restore %ds to 0x7800 */
push %es
pop %ds
/* Calculate how many sectors we have to read for the partition table.
* This assumes that the entire partition table is less than 64 KiB.
* Only the low halves of %eax and %edi are shuffled around from here on;
* LBA[31:16] and CRC[31:16] stay parked in the upper halves. */
xchg %ax, %si /* %ax = entry size, %si = LBA[15:0] */
push %ax /* save entry size */ /* '1 v */
push %di /* save crc[15:0] */ /* '2 v */
push %dx /* save drive number */ /* '3 v */
mul %cx /* %dx:%ax = entry size * entry count */
jno start2 /* fail if %dx != 0 (`mul` overflowed %ax) */
/* pardon the sudden interruption; we have to insert the error slide
* here because that way we maximize the number of jumps with 1-byte
* relative addresses, which saves a couple of bytes here and there */
END _start
/*
* Welcome to the "error slide"!
* This ... thing relies on the fact that we don't quite use %bp for
* its intended purpose and instead always keep it at 0x31 (ASCII '1').
* The reason for this is that errors require a jump anyway, which
* takes up at least 2 bytes, while `inc %bp` requires only 1 byte.
* Therefore, we quite literally "encode" the error code through the
* jump instruction by means of jumping to the respective offset.
*/
LOCAL error_slide
/*
* Common error exit. Control enters at one of the err_* labels below and
* slides down through the remaining `inc %bp` instructions, so the entry
* label alone selects the digit that gets printed after "error ".
* With %bp = 0x31 ('1') on entry, the resulting codes are:
*
*   '1'  err_no_stage1
*   '2'  err_read_failed
*   '3'  err_no_gpt
*   '4'  err_bad_gpt_csum
*   '5'  err_no_int13_extensions
*   '6'  err_bad_boot_drive
*   '7'  err_bad_stage1_magic
*   '8'  err_bad_stage1_csum
*   '9'  err_gpt_too_big
*
* This routine never returns; the stack contents are irrelevant here.
*/
/* err_gpt_too_big is carefully placed so that the `jno` above falls
* through exactly here on failure, avoiding an unconditional jump */
err_gpt_too_big:
inc %bp
err_bad_stage1_csum:
inc %bp
err_bad_stage1_magic:
inc %bp
err_bad_boot_drive:
inc %bp
err_no_int13_extensions:
inc %bp
err_bad_gpt_csum:
inc %bp
err_no_gpt:
inc %bp
err_read_failed:
inc %bp
err_no_stage1:
mov $msg_error, %si
/* 6 is the offset of the error number character within the error
* message, see the definition of msg_error below. We use %es
* because that's the only segment always pointing to 0x7800.
* This is a 16-bit `or`: the low byte of %bp lands on the `\0`
* placeholder, the (zero) high byte leaves the following '\r' as is. */
or %bp, %es:6(%si)
call print
/*
* The GRUB uses int18 ("diskless boot hook" aka "start cassette BASIC")
* in its error routine, so we do the same. Issuing this interrupt does
* any of the following (see <http://www.ctyme.com/intr/rb-2241.htm>):
* - start BASIC from the integrated ROM (on the OG IBM PC)
* - reboot the system
* - display some kind of error message
* - attempt a network boot
* - try some other boot device
* - nothing lmao
*/
int $0x18
/* if we make it here, it's over for good */
die: cli
hlt
jmp die /* `hlt` can still wake up (e.g. on an NMI), so keep looping */
END error_slide
LOCAL start2
/*
* Second half of the bootstrap routine (steps 4 to 8), entered from the
* `jno` at the end of _start with the following state:
*
*   %ax         partition table size in bytes (%dx = 0, no overflow)
*   %cx         number of partition entries
*   %si         partition table LBA[15:0]
*   %eax[31:16] partition table LBA[31:16]
*   %bx         partition table LBA[47:32]
*   %edi[31:16] expected partition table CRC[31:16]
*   %ds, %es    0x7800
*   %fs         0x0050
*   stack       entry size ('1), CRC[15:0] ('2), drive number ('3)
*/
mov %ax, %di /* %di = table size in bytes */
divw (drive_params + 0x18) /* %ax = table size / sector size */
test %dx, %dx /* check remainder */
jz 1f
/* this increment can't overflow because we already checked that the
* dividend is <= 0xffff, so unless we're dealing with a 1-byte sector
* disk we can't possibly get this anywhere near the 16-bit limit */
inc %ax /* round up if modulus > 0 */
1: pop %dx /* restore drive number */ /* '3 ^ */
xchg %ax, %cx /* %ax = entry count, %cx = sector count */
xchg %ax, %si /* %ax = LBA[15:0], %si = entry count */
push %di /* save table size in bytes */ /* '3 v */
xor %di, %di
call read_lba /* partition table -> 0x0050:0000 (replaces the header) */
/* check the partition table array's CRC */
pop %cx /* restore table size in bytes */ /* '3 ^ */
push %si /* save entry count */ /* '3 v */
xor %si, %si
call crc32 /* %ebx = ~CRC, CF = 1 */
pop %cx /* restore entry count */ /* '3 ^ */
pop %di /* restore CRC[15:0] */ /* '2 ^ */
adc %ebx, %edi /* %edi = 0 if checksum matches */
jnz err_bad_gpt_csum
/*
* Step 5: Search the GPT for stage1
*
* Compare the first 16 bytes of every entry against bussy_guid.
* %si = offset of the current entry, %bx = entry size, %cx = entries left.
*/
pop %bx /* restore entry size */ /* '1 ^ */
xor %si, %si
1: push %cx /* save remaining entry count */ /* '1 v */
push %si /* save current position */ /* '2 v */
mov $bussy_guid, %di
mov $(bussy_guid_end - bussy_guid), %cx
repe cmpsb %es:(%di), %fs:(%si)
pop %si /* restore current position */ /* '2 ^ */
pop %cx /* restore remaining entry count */ /* '1 ^ */
je 2f /* found it */
add %bx, %si /* %si += entry size */
/* `loopne` continues while %cx != 0 after the decrement and ZF is clear;
* ZF comes from the `add` above and is set if %si wrapped to exactly 0 */
loopne 1b /* only loop if %si didn't wrap around */
jmp err_no_stage1
/*
* Step 6: Load stage1 to the beginning of low memory (0x0500)
*
* stage1 must not exceed 32 K because some BIOSes freak out when asked
* to load 64 K or more at once (which would cross a segment boundary).
* We take advantage of that fact by "hardcoding" the number of bytes to
* be loaded: We start with 64 K - 1 B, and divide that by the
* logical sector size. The result is a sector count that definitely
* spans at least 32 K unless we're dealing with 64 K sectors or larger,
* in which case we wouldn't have made it this far anyway.
*/
2: push %dx /* save drive number */ /* '1 v */
or $-1, %ax /* %ax = 0xffff (imm8 with sign extension) */
xor %dx, %dx
divw (drive_params + 0x18) /* %ax = 0xffff / sector size = sector count */
xchg %ax, %cx /* %cx = sector count (xchg saves 1 byte) */
/* %si still points to the matching partition entry in %fs */
mov %fs:0x20(%si), %eax /* %eax = LBA[31:0] */
mov %fs:0x24(%si), %bx /* %bx = LBA[47:32] */
xor %di, %di
pop %dx /* restore drive number */ /* '1 ^ */
call read_lba /* stage1 -> 0x0050:0000 (replaces the partition table) */
/* these values are for stage1 */
push %eax /* boot partition LBA[31:0] */ /* '1 v */
push %bx /* boot partition LBA[47:32] */ /* '2 v */
push %dx /* boot drive number */ /* '3 v */
/*
* Step 7: Check stage1's magic number and CRC
*/
xor %si, %si
cmpw $0xacab, %fs:(%si) /* offset 0x00: magic number */
jne err_bad_stage1_magic
mov %fs:0x02(%si), %cx /* offset 0x02: byte count for CRC */
xor %eax, %eax
xchg %fs:0x04(%si), %eax /* offset 0x04: CRC (replace with 0) */
push %ax /* save CRC[15:0] (crc32 clobbers %al) */ /* '4 v */
call crc32 /* %ebx = ~CRC, CF = 1 */
pop %ax /* restore CRC[15:0] */ /* '4 ^ */
adc %ebx, %eax /* check CRC */
jnz err_bad_stage1_csum
/*
* Step 8: Jump to stage1 (finally)
*
* The three remaining items on the stack are for stage1
* (from the top: drive number, LBA[47:32], LBA[31:0]).
* Segment registers are left as they are: %ds = %es = %ss = 0x7800
* and %fs = 0x0050.
*/
ljmp $0x0000, $0x0510 /* entry point is right after the 16-byte header */
END start2
/*
* CHAPTER 2
*
* "Utility Subroutines"
*
* 0x7c00 + N
*/
/*
* Load %cx sectors, starting at LBA %bx:%eax, from disk %dl to %fs:%di
* using int13/42 ("extended read").
* The 16-bit general purpose registers are saved with `pusha` and restored
* with `popa`, so on success only FLAGS change. On failure this jumps to
* err_read_failed and never returns (the `pusha` frame is abandoned).
* The DAP is filled in and handed to the BIOS through %ds:%si, so %ds must
* be the segment containing `dap` when this is called.
*
* %eax: logical block address (sector number), bits 31:0
* %bx: logical block address (sector number), bits 47:32
* %cx: # of sectors to read (should be < 128)
* %dl: disk number
* %di: destination
* %fs: destination segment
*/
LOCAL read_lba
pusha
mov $dap, %si
/* fill in the DAP; bytes 1 and 14-15 are never written here and rely on
* .bss having been zeroed at startup */
movb $(dap_end - dap), (%si) /* 0: packet size */
mov %cx, 2(%si) /* 2: sector count */
mov %di, 4(%si) /* 4: destination offset */
mov %fs, %cx /* segment registers can't be stored with an offset-free imm */
mov %cx, 6(%si) /* 6: destination segment */
movl %eax, 8(%si) /* 8: LBA[31:0] */
mov %bx, 12(%si) /* 12: LBA[47:32] */
mov $0x42, %ah
int $0x13
/* You're supposed to retry several times if this fails because
* *floppies* tend to be quite unreliable. We just assume that
* nobody in their right mind would boot off of a floppy anymore
* unless they're into retro tech, in which case they wouldn't use
* a modern bootloader on their machine to begin with. HDDs/SSDs
* should be reliable enough that we can assume it always works. */
mov $0, %al
adc %ax, %ax /* CF and %ah are 0 on success */
jnz err_read_failed
popa
ret
END read_lba
/*
* Calculate the CCITT32 ANSI CRC (polynomial 0x04c11db7) of %cx bytes
* at %fs:%si and return THE COMPLEMENT of the result in %ebx.
* Clobbers %al, %cx, and %si. Does NOT check for overflows (%si + %cx
* must be <= 0x10000). If %cx is 0, the size is 64 KiB (0x10000), in
* which case %si should also be 0 because of the missing wrap check.
*
* The implementation works bit by bit, least significant bit first, which
* is why the code contains the bit-reversed polynomial 0xedb88320. The
* register starts out as 0xffffffff and the usual final inversion is
* skipped, hence the complemented return value.
*
* This also sets CF = 1, which we use for checking the CRC with a
* single `adc` instruction (x + ~x + 1 = 0 because that's how 2's
* complement works). `stc` is only 1 byte whereas `not r32` takes up
* 3 in 16-bit mode, and `adc` is the same size as `cmp` and `xor`.
*
* The algorithm is stolen from "Hacker's Delight", second edition by
* Henry S. Warren, Jr. and painstakingly ported to x86 assembly with
* focus on minimizing the binary size. NB: I have zero clue how CRC
* *actually* works, so there may be room for further optimizations.
*
* %al: clobber
* %ebx: return value
* %cx: byte count (clobber)
* %si: data (clobber)
* %fs: data segment
* CF = 1
*/
LOCAL crc32
/* this is expressed as an imm8 w/ sign extension */
or $-1, %ebx /* %ebx = 0xffffffff */
/* outer loop: one iteration per input byte, counted by %cx */
1: lods %fs:(%si), %al
xor %al, %bl
push %cx /* the inner loop needs %cx as its own counter */
mov $8, %cx
/* inner loop: one iteration per bit of the byte just mixed in */
2: shr %ebx
jnc 3f /* only apply the polynomial if a 1 bit was shifted out */
xor $0xedb88320, %ebx
3: loop 2b
pop %cx
loop 1b
stc
ret
END crc32
/*
* Print a string through the BIOS teletype service (int10/0e).
* The last character of the string is marked by having bit 7 set; it is
* printed (with bit 7 cleared) before the routine returns.
* All 16-bit general purpose registers are preserved via `pusha`/`popa`.
*
* %si: address of string (terminated with last char | 0x80)
* %es: segment of string
*/
LOCAL print
/* according to <http://www.ctyme.com/intr/rb-0106.htm>, some BIOSes
* destroy %bp when the write causes the screen to scroll (???) */
pusha
1: lods %es:(%si), %al
/* Strings are ASCII only, so we know bit 7 (MSB) is always 0.
* We use that as a terminator to save space (stolen from BSD).
* `btr` copies bit 7 into CF and clears it in %al; the `mov`s below
* don't touch FLAGS, so CF is still the terminator flag at the `jnc`. */
btr $7, %ax
mov $0x0001, %bx /* page = 00, foreground color = 01 */
mov $0x0e, %ah /* print char, teletype mode */
int $0x10
jnc 1b /* XXX this breaks if BIOS trashes CF */
popa
ret
END print
/*
* CHAPTER 3
*
* "Constants and Variables"
*
* 0x7c00 + N + M
*/
.data
/*
* TERMINATOR char
* Emit the final character of a string with bit 7 set, which is the
* end-of-string marker that `print` looks for.
*/
.macro TERMINATOR char
.byte \char | 0x80
.endm
/*
* Error message template, printed by error_slide.
* The error_slide subroutine will replace the `\0` character with the
* actual error code. ATTENTION: the 6-byte offset is hardcoded!
* Because the message is patched in place it lives in .data, not .rodata.
* The trailing '\n' carries the bit-7 terminator expected by `print`.
*/
LOCAL msg_error, object
.ascii "error \0\r"
TERMINATOR '\n'
END msg_error
.section .rodata
/* greeting printed by _start right after the stack has been set up;
* the trailing space doubles as the bit-7 terminator for `print` */
LOCAL msg_loader_info, object
.ascii "BUSSY"
TERMINATOR ' '
END msg_loader_info
/* expected GPT header signature; _start compares it byte by byte against
* the start of the sector read from LBA 1 (not a `print` string, so there
* is no terminator; the length is gpt_magic_end - gpt_magic) */
LOCAL gpt_magic, object
.ascii "EFI PART"
gpt_magic_end:
END gpt_magic
/* bussy boot partition GUID (61476542-4479-436f-7269-6d6521557755)
* These are the raw 16 bytes as stored on disk; start2 compares them
* against the first 16 bytes of every GPT partition entry. The textual
* form above is the same byte sequence with the first three groups read
* as little-endian integers. */
LOCAL bussy_guid, object
.ascii "BeGayDoCrime!UwU"
bussy_guid_end:
END bussy_guid
/*
* CHAPTER 4
*
* "MBR Header"
*
* 0x7db8
*/
.section .header, "a", "progbits"
/* 4-byte identifier field that opens the MBR header section
* (presumably the MBR disk signature; exported so it can be referenced
* from outside this file) */
GLOBL mbr_uid, object
.long 0xffffffff
END mbr_uid
/* 2 reserved bytes between the identifier and the partition table */
GLOBL mbr_rsvd, object
.word 0
END mbr_rsvd
/*
* First MBR Partition Table Entry: the GPT protective partition.
* It starts at LBA 1 and claims the maximum representable size so that
* GPT-unaware tools see the whole disk as occupied.
*
* The starting CHS address must describe the same sector as the starting
* LBA, i.e. cylinder 0, head 0, sector 2 (bytes 00 02 00, which is the
* value 0x000200 that the UEFI specification prescribes for this field).
*
* NOTE(review): the UEFI specification asks for a boot indicator of 0x00;
* 0x80 (active) is kept as it was, presumably because some legacy BIOSes
* refuse to boot from a disk without an active partition.
*/
GLOBL mbr_pte1, object
/* 0 */ .byte 0x80 /* boot indicator flag */
/* 1 */ .byte 0x00 /* starting head */
/* 2 */ .word 0x0002 /* starting cylinder[15:6]/sector[5:0] */
/* 4 */ .byte 0xee /* system id (GPT protective) */
/* 5 */ .byte 0xff /* ending head */
/* 6 */ .word 0xffff /* ending cylinder[15:6]/sector[5:0] */
/* 8 */ .word 0x0001 /* starting LBA, low word */
/* a */ .word 0x0000 /* starting LBA, high word */
/* c */ .word 0xffff /* number of sectors, low word */
/* e */ .word 0xffff /* number of sectors, high word */
END mbr_pte1
/*
* PTE_EMPTY num
* Emit an unused MBR partition table entry (16 zero bytes) as the global
* object mbr_pte<num>.
*/
.macro PTE_EMPTY num
GLOBL mbr_pte\num , object
.fill 16, 1, 0
END mbr_pte\num
.endm
/* entries 2 to 4 are unused */
PTE_EMPTY 2
PTE_EMPTY 3
PTE_EMPTY 4
/* MBR boot signature, the last word of the boot sector. The relocation
* loop in _start copies this word last and reuses the 0xaa55 left in %ax. */
GLOBL mbr_magic, object
.word 0xaa55
END mbr_magic
mbr_end:
/*
* CHAPTER 5
*
* "Scratch and Stack Space"
*
* 0x7e00
*/
.section .bss
bss_start:
/* "Disk Address Packet" (for int13/42 "extended read")
* Filled in by read_lba on every call. Fields that read_lba never writes
* (byte 1 and the top 16 bits of the LBA) depend on _start having zeroed
* .bss. */
.align 8
LOCAL dap, object
/* 0 */ .byte 0 /* size of DAP (must be initialized to 0x10) */
/* 1 */ .byte 0 /* always 0 */
/* 2 */ .word 0 /* number of sectors to transfer (<= 127) */
/* 4 */ .word 0 /* destination offset (within segment, must be word aligned) */
/* 6 */ .word 0 /* destination segment */
/* 8 */ .quad 0 /* first LBA (48-bit) */
dap_end:
END dap
/*
* Result buffer for int13/48 ("get extended drive parameters").
* _start initializes the size field and issues the call once; afterwards
* only the logical sector size at offset 0x18 is read (by start2).
*
* ATTENTION: this must stay the last .bss member because apparently some
* buggy BIOSes ignore the buffer size and write beyond the data structure
*/
.align 8
LOCAL drive_params, object
/* 00 */.word 0 /* size of buffer (must be initialized to 0x1a) */
/* 02 */.word 0 /* information flags (must be 0) */
/* 04 */.long 0 /* number of cylinders */
/* 08 */.long 0 /* number of heads */
/* 0c */.long 0 /* number of sectors per track */
/* 10 */.quad 0 /* number of total logical sectors */
/* 18 */.word 0 /* bytes per logical sector */
drive_params_end:
END drive_params
/* stack is initialized to 0x8000 */