Monday, September 25, 2017

Linux kernel debugging with GDB: getting a task running on a CPU

The current task is saved in per-cpu space for x86-64 and is accessed through the gs  register at current_task offset as
mov    %gs:0xd440,%rdx
(gdb) p/x &current_task
$63 = 0xd440
(gdb) p/x __per_cpu_offset[0]
$64 = 0xffff88001fc00000
(gdb) x/gx 0xffff88001fc00000+0xd440
0xffff88001fc0d440: 0xffff88001dea6a00
(gdb) p/d ((struct task_struct*)0xffff88001dea6a00)->pid
$67 = 243
(gdb) p/x ((struct task_struct*)0xffff88001dea6a00)->mm
$69 = 0xffff88001d1bc800
(gdb) p/x ((struct task_struct*)0xffff88001dea6a00)->active_mm
$70 = 0xffff88001d1bc800
(gdb) p/x __per_cpu_offset[2]
$73 = 0xffff88001fd00000
(gdb) x/gx 0xffff88001fd00000+0xd440
0xffff88001fd0d440: 0xffff88001f240000
(gdb) p/x ((struct task_struct*)0xffff88001f240000)->pid
$74 = 0x1
(gdb) lx-ps
0xffffffff81e104c0 <init_task> 0 swapper/0
0xffff88001f240000 1 systemd
0xffff88001f240d40 2 kthreadd
0xffff88001f2427c0 4 kworker/0:0H
0xffff88001f244240 6 mm_percpu_wq
0xffff88001f244f80 7 ksoftirqd/0
0xffff88001f245cc0 8 rcu_sched
0xffff88001f246a00 9 rcu_bh
0xffff88001f298000 10 migration/0
0xffff88001f298d40 11 watchdog/0
0xffff88001f29c240 12 cpuhp/0
0xffff88001f29cf80 13 cpuhp/1
0xffff88001f29dcc0 14 watchdog/1
0xffff88001f29ea00 15 migration/1
0xffff88001f2c8000 16 ksoftirqd/1
0xffff88001f2c9a80 18 kworker/1:0H
0xffff88001f2ca7c0 19 cpuhp/2
0xffff88001f2cb500 20 watchdog/2
0xffff88001f2cc240 21 migration/2
0xffff88001f2ccf80 22 ksoftirqd/2
0xffff88001f2cea00 24 kworker/2:0H
0xffff88001f310000 25 cpuhp/3
0xffff88001f310d40 26 watchdog/3
0xffff88001f311a80 27 migration/3
0xffff88001f3127c0 28 ksoftirqd/3
0xffff88001f314240 30 kworker/3:0H
0xffff88001f314f80 31 kdevtmpfs
0xffff88001f315cc0 32 netns
0xffff88001dc28000 34 khungtaskd
0xffff88001dc28d40 35 oom_reaper
0xffff88001dc29a80 36 writeback
0xffff88001dc2a7c0 37 kcompactd0
0xffff88001dc2b500 38 ksmd
0xffff88001dc2c240 39 crypto
0xffff88001dc2cf80 40 kintegrityd
0xffff88001dc2dcc0 41 bioset
0xffff88001dc2ea00 42 kblockd
0xffff88001dcd8000 43 ata_sff
0xffff88001dcd8d40 44 md
0xffff88001dcd9a80 45 edac-poller
0xffff88001dcda7c0 46 devfreq_wq
0xffff88001dcdb500 47 watchdogd
0xffff88001dcdc240 48 kworker/1:1
0xffff88001dcdcf80 49 kworker/2:1
0xffff88001dcddcc0 50 kworker/3:1
0xffff88001ddf8000 52 kauditd
0xffff88001ddf8d40 53 kswapd0
0xffff88001ddf9a80 54 bioset
0xffff88001ddfa7c0 55 ecryptfs-kthrea
0xffff88001dff0d40 72 kthrotld
0xffff88001dff1a80 73 acpi_thermal_pm
0xffff88001dff27c0 74 bioset
0xffff88001dff3500 75 bioset
0xffff88001dff4240 76 bioset
0xffff88001dff4f80 77 bioset
0xffff88001dff5cc0 78 bioset
0xffff88001dff6a00 79 bioset
0xffff88001dff0000 80 bioset
0xffff88001d660000 81 bioset
0xffff88001d660d40 82 scsi_eh_0
0xffff88001d661a80 83 scsi_tmf_0
0xffff88001d6627c0 84 scsi_eh_1
0xffff88001d663500 85 scsi_tmf_1
0xffff88001d718d40 91 ipv6_addrconf
0xffff88001d71dcc0 104 charger_manager
0xffff88001d71a7c0 105 bioset
0xffff88001d71ea00 106 bioset
0xffff88001d71c240 107 bioset
0xffff88001d719a80 110 jbd2/sda-8
0xffff88001d718000 111 ext4-rsv-conver
0xffff88001ddfdcc0 123 kworker/1:1H
0xffff88001ddfcf80 124 kworker/2:1H
0xffff88001ddfc240 127 kworker/0:1H
0xffff88001f350d40 135 kworker/3:2
0xffff88001dea4240 137 kworker/1:2
0xffff88001d0b0d40 140 systemd-journal
0xffff88001dea27c0 142 kworker/2:2
0xffff88001dea0d40 146 kworker/0:3
0xffff88001ded6a00 153 systemd-udevd
0xffff88001dea0000 156 kworker/3:1H
0xffff88001dea5cc0 227 cron
0xffff88001dea1a80 229 rsyslogd
0xffff88001ded0d40 235 in:imuxsock
0xffff88001ded0000 236 in:imklog
0xffff88001ded27c0 237 rs:main Q:Reg
0xffff88001ded1a80 233 agetty
0xffff88001c5d8d40 234 login
0xffff88001dea6a00 243 bash
0xffff88001dea3500 248 kworker/u8:2
0xffff88001c5d9a80 251 kworker/0:1
0xffff88001c5dc240 445 kworker/u8:1
0xffff88001c5ddcc0 452 kworker/u8:0

Sunday, August 13, 2017

Unwinding a kernel mode stack for exception in Linux.

Generally GDB is unable to unwind a kernel call stack with an exception frame on it. The unwinding stops on an exception processing. For example
(gdb) bt
#0  delay_tsc (__loops=5241148) at ../arch/x86/lib/delay.c:78
#1  0xffffffff8134b532 in __delay (loops=<optimised out>) at ../arch/x86/lib/delay.c:160
#2  __const_udelay (xloops=<optimised out>) at ../arch/x86/lib/delay.c:174
#3  0xffffffff81132620 in panic (fmt=<optimised out>) at ../kernel/panic.c:297
#4  0xffffffff8101f080 in oops_end (flags=70, regs=0xffffc90000227c18, signr=9) at ../arch/x86/kernel/dumpstack.c:235
#5  0xffffffff8104d1c7 in no_context (regs=0xffffc90000227c18, error_code=0, address=8, signal=<optimised out>, si_code=<optimised out>) at ../arch/x86/mm/fault.c:867
#6  0xffffffff8104d456 in __bad_area_nosemaphore (regs=0xffffc90000227c18, error_code=0, address=8, vma=<optimised out>, si_code=196609) at ../arch/x86/mm/fault.c:953
#7  0xffffffff8104d59f in bad_area_nosemaphore (regs=<optimised out>, error_code=<optimised out>, address=<optimised out>, vma=<optimised out>) at ../arch/x86/mm/fault.c:960
#8  0xffffffff8104d8e7 in __do_page_fault (regs=0xffffc90000227c18, error_code=0, address=8) at ../arch/x86/mm/fault.c:1387
#9  0xffffffff8104dccc in do_page_fault (regs=<optimised out>, error_code=<optimised out>) at ../arch/x86/mm/fault.c:1508
#10 0xffffffff8193ecd2 in page_fault () at ../arch/x86/entry/entry_64.S:1005
#11 0xffff88001decdb40 in ?? ()
#12 0xffff88001c02ae48 in ?? ()
#13 0xffffc90000227d98 in ?? ()
#14 0xffff88001d733000 in ?? ()
#15 0xffffc90000227cf0 in ?? ()
#16 0xffff88001d733000 in ?? ()
#17 0xffff88001cd02510 in ?? ()
#18 0xffffc90000227e18 in ?? ()
#19 0x0000000000000000 in ?? ()
Linux has a structure struct pt_regs to save thread context state. A pointer to this strucrue is provided to an exception processing routine and contains a context of a thread when an exception happened. Using register values from this structure a call stack at the moment of exception can be captured with GDB.
(gdb) f 8
#8  0xffffffff8104d8e7 in __do_page_fault (regs=0xffffc90000227c18, error_code=0, address=8) at ../arch/x86/mm/fault.c:1387
1387    bad_area_nosemaphore(regs, error_code, address, NULL);
Having a valid regs pointer set register values.
(gdb) p/x *regs
$2 = {r15 = 0xffff88001decdb40, r14 = 0xffff88001c02ae48, r13 = 0xffffc90000227d98, r12 = 0xffff88001d733000, bp = 0xffffc90000227cf0, bx = 0xffff88001d733000, r11 = 0xffff88001cd02510, 
  r10 = 0xffffc90000227e18, r9 = 0x0, r8 = 0xffff88001fc9d180, ax = 0x0, cx = 0x0, dx = 0x1000, si = 0xffff88001d733000, di = 0xffffc90000227d00, orig_ax = 0xffffffffffffffff, ip = 0xffffffff811aa30c, 
  cs = 0x10, flags = 0x246, sp = 0xffffc90000227cc8, ss = 0x18}
(gdb) set $rsp=0xffffc90000227cc8
(gdb) set $rip=0xffffffff811aa30c.
(gdb) set $rbp=0xffffc90000227cf0
(gdb) set $rbx=0xffff88001d733000
(gdb) set $r15=0xffff88001decdb40
(gdb) set $r14=0xffff88001c02ae48
(gdb) set $r13=0xffffc90000227d98
(gdb) set $r12=0xffff88001d733000
(gdb) set $r11=0xffff88001cd02510
(gdb) set $r10=0xffffc90000227e18
(gdb) set $r9=0
(gdb) set $rsi=0xffff88001d733000
(gdb) set $rdi=0xffffc90000227d00
Now a call stack at the momemnt of exception can be examined.
(gdb) bt
#0  __read_once_size (size=<optimised out>, res=<optimised out>, p=<optimised out>) at ../include/linux/compiler.h:254
#1  __read_seqcount_begin (s=<optimised out>) at ../include/linux/seqlock.h:112
#2  raw_read_seqcount_begin (s=<optimised out>) at ../include/linux/seqlock.h:147
#3  read_seqcount_begin (s=<optimised out>) at ../include/linux/seqlock.h:164
#4  get_fs_root_rcu (root=<optimised out>, fs=<optimised out>) at ../fs/dcache.c:3222
#5  d_path (path=0xffffc90000227d00, buf=0xffff88001d733000 "", buflen=4096) at ../fs/dcache.c:3265
#6  0xffffffffc0000076 in redirfs_get_filename ()
#7  0xffffffffc0014121 in dummyflt_release (context=<optimised out>, args=0xffffc90000227d98) at /work/redirfs/src/dummyflt/dummyflt.c:104
#8  0xffffffffc000892e in rfs_precall_flts ()
#9  0xffffffffc0002a42 in rfs_release ()
#10 0xffffffff81193a7a in __fput (file=0xffff88001cd02500) at ../fs/file_table.c:209
#11 0xffffffff81193bb9 in ____fput (work=<optimised out>) at ../fs/file_table.c:245
#12 0xffffffff810758b9 in task_work_run () at ../kernel/task_work.c:116
#13 0xffffffff8105da35 in exit_task_work (task=<optimised out>) at ../include/linux/task_work.h:21
#14 do_exit (code=<optimised out>) at ../kernel/exit.c:878
#15 0xffffffff8105f14e in do_group_exit (exit_code=0) at ../kernel/exit.c:982
#16 0xffffffff8105f1bf in SYSC_exit_group (error_code=<optimised out>) at ../kernel/exit.c:993
#17 SyS_exit_group (error_code=<optimised out>) at ../kernel/exit.c:991
#18 0xffffffff8193d060 in entry_SYSCALL_64 () at ../arch/x86/entry/entry_64.S:203
#19 0x0000000000000000 in ?? ()

Saturday, July 29, 2017

Windows developers' misconception about UNIX.

While reading osronline.com forum on Windows file system development I ran into a common misconception among Windows developers regarding UNIX design. http://osronline.com/cf.cfm?PageURL=showThread.CFM?link=285260
<QUOTE>
The essential difference between how the NT kernel works and how Unix was
designed is that NT caches streams of data (above the file system), whereas
on Unix data is cached at the block layer.
</QUOTE>
I spent 5 minutes to bust it.
This is true only for ancient *NIX kernels. Modern kernels use the same technique as NT with caching backed by file mapping structures.
For example below is a call stack from my test machine running the Linux kernel (4.12.2) when ext4 read operation (ext4_file_read_iter) called the "Linux cache manager" ( do_generic_file_read -> page_cache_sync_readahead ) to bring data in the cache backed by mapped file structures( struct address_space ) when processing the read() system call.
This resulted in a recursive call to mapping->a_ops->readpages into a file system's ext4_readpages . This is an analogue of a cached read in NT. Mac OS X uses the same caching by file mapping technique borrowed from BSD.
(gdb) bt
#0  ext4_readpages (file=0xffff88001d59b300, mapping=0xffff88001d1d56c0, pages=0xffffc90000817c30, nr_pages=1) at ../fs/ext4/inode.c:3308
#1  0xffffffff811b6288 in read_pages (gfp=<optimised out>, nr_pages=<optimised out>, pages=<optimised out>, filp=<optimised out>, mapping=<optimised out>) at ../mm/readahead.c:121
#2  __do_page_cache_readahead (mapping=<optimised out>, filp=<optimised out>, offset=1, nr_to_read=<optimised out>, lookahead_size=<optimised out>) at ../mm/readahead.c:199
#3  0xffffffff811b64b8 in ra_submit (ra=<optimised out>, ra=<optimised out>, ra=<optimised out>, filp=<optimised out>, mapping=<optimised out>) at ../mm/internal.h:66
#4  ondemand_readahead (mapping=0xffff88001d1d56c0, ra=0xffff88001d59b398, filp=0xffff88001d59b300, hit_readahead_marker=<optimised out>, offset=0, req_size=<optimised out>) at ../mm/readahead.c:478
#5  0xffffffff811b678e in page_cache_sync_readahead (mapping=<optimised out>, ra=<optimised out>, filp=<optimised out>, offset=<optimised out>, req_size=<optimised out>) at ../mm/readahead.c:510
#6  0xffffffff811a7a62 in do_generic_file_read (written=<optimised out>, iter=<optimised out>, ppos=<optimised out>, filp=<optimised out>) at ../mm/filemap.c:1813
#7  generic_file_read_iter (iocb=0x20000, iter=<optimised out>) at ../mm/filemap.c:2069
#8  0xffffffff812d1386 in ext4_file_read_iter (iocb=0xffff88001d59b300, to=0xffff88001d1d56c0) at ../fs/ext4/file.c:70
#9  0xffffffff81237680 in call_read_iter (file=<optimised out>, iter=<optimised out>, kio=<optimised out>) at ../include/linux/fs.h:1728
#10 new_sync_read (ppos=<optimised out>, len=<optimised out>, buf=<optimised out>, filp=<optimised out>) at ../fs/read_write.c:440
#11 __vfs_read (file=0xffff88001d59b300, buf=<optimised out>, count=<optimised out>, pos=0xffffc90000817f18) at ../fs/read_write.c:452
#12 0xffffffff81237cc3 in vfs_read (file=0xffff88001d59b300, buf=0x7fb92a0cb000 <error: Cannot access memory at address 0x7fb92a0cb000>, count=<optimised out>, pos=0xffffc90000817f18)
    at ../fs/read_write.c:473
#13 0xffffffff81239385 in SYSC_read (count=<optimised out>, buf=<optimised out>, fd=<optimised out>) at ../fs/read_write.c:589
#14 SyS_read (fd=<optimised out>, buf=140433251151872, count=131072) at ../fs/read_write.c:582
#15 0xffffffff818aaffb in entry_SYSCALL_64 () at ../arch/x86/entry/entry_64.S:203

(gdb) f 4
#4  ondemand_readahead (mapping=0xffff88001d1d56c0, ra=0xffff88001d59b398, filp=0xffff88001d59b300, hit_readahead_marker=<optimised out>, offset=0, req_size=<optimised out>) at ../mm/readahead.c:478
478  return ra_submit(ra, mapping, filp);

(gdb) p/x *mapping
$14 = {host = 0xffff88001d1d5548, page_tree = {gfp_mask = 0x1180020, rnode = 0x0}, tree_lock = {{rlock = {raw_lock = {val = {counter = 0x0}}}}}, i_mmap_writable = {counter = 0x0}, i_mmap = {
    rb_node = 0x0}, i_mmap_rwsem = {count = {counter = 0x0}, wait_list = {next = 0xffff88001d1d56f0, prev = 0xffff88001d1d56f0}, wait_lock = {raw_lock = {val = {counter = 0x0}}}, osq = {tail = {
        counter = 0x0}}, owner = 0x0}, nrpages = 0x0, nrexceptional = 0x0, writeback_index = 0x0, a_ops = 0xffffffff81a3a680, flags = 0x0, private_lock = {{rlock = {raw_lock = {val = {
            counter = 0x0}}}}}, gfp_mask = 0x14200ca, private_list = {next = 0xffff88001d1d5740, prev = 0xffff88001d1d5740}, private_data = 0x0}
            
(gdb) ptype mapping
type = struct address_space {
    struct inode *host;
    struct radix_tree_root page_tree;
    spinlock_t tree_lock;
    atomic_t i_mmap_writable;
    struct rb_root i_mmap;
    struct rw_semaphore i_mmap_rwsem;
    unsigned long nrpages;
    unsigned long nrexceptional;
    unsigned long writeback_index;
    const struct address_space_operations *a_ops;
    unsigned long flags;
    spinlock_t private_lock;
    gfp_t gfp_mask;
    struct list_head private_list;
    void *private_data;
} *

(gdb) f 1
#1  0xffffffff811b6288 in read_pages (gfp=<optimised out>, nr_pages=<optimised out>, pages=<optimised out>, filp=<optimised out>, mapping=<optimised out>) at ../mm/readahead.c:121
121   ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
(gdb) l
116  int ret;
117 
118  blk_start_plug(&plug);
119 
120  if (mapping->a_ops->readpages) {
121   ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
122   /* Clean up the remaining pages */
123   put_pages_list(pages);
124   goto out;
125  }

(gdb) f 9
#9  0xffffffff81237680 in call_read_iter (file=<optimised out>, iter=<optimised out>, kio=<optimised out>) at ../include/linux/fs.h:1728
1728  return file->f_op->read_iter(kio, iter);
(gdb) l
1723 } ____cacheline_aligned;
1724 
1725 static inline ssize_t call_read_iter(struct file *file, struct kiocb *kio,
1726          struct iov_iter *iter)
1727 {
1728  return file->f_op->read_iter(kio, iter);
1729 }
1730 
1731 static inline ssize_t call_write_iter(struct file *file, struct kiocb *kio,
1732           struct iov_iter *iter)
(gdb) 

Monday, July 3, 2017

FltCreateFile and top device.

FltCreateFile calls IoCreateFileEx with IO_DRIVER_CREATE_CONTEXT.DeviceObjectHint pointing to the Filter Manager's filter object and then calls the lower registered filters. That allows the created file object to have TopDeviceObjectHint pointing to the Filter Manager's object.
 # Child-SP          RetAddr           Call Site
00 ffffe101`3fffebf8 fffff801`92125200 FLTMGR!FltpCreate
01 ffffe101`3fffec00 fffff801`9213058b nt!IopParseDevice+0x7f0
02 ffffe101`3fffedd0 fffff801`921340c0 nt!ObpLookupObjectName+0x46b
03 ffffe101`3fffefa0 fffff801`9213803a nt!ObOpenObjectByNameEx+0x1e0
04 ffffe101`3ffff0e0 fffff801`920b0eb4 nt!IopCreateFile+0x3aa
05 ffffe101`3ffff180 fffff808`485240d5 nt!IoCreateFileEx+0x124
06 ffffe101`3ffff210 fffff808`4853d32d FLTMGR!FltpCreateFile+0x1cd
07 ffffe101`3ffff310 fffff808`4b6a79f8 FLTMGR!FltCreateFile+0x8d
08 ffffe101`3ffff3a0 fffff808`484f4b4c avscan!AvPreCreate+0x378 [d:\work\avscan\filter\avscan.c @ 2106]
09 ffffe101`3ffff4b0 fffff808`484f46ec FLTMGR!FltpPerformPreCallbacks+0x2ec
0a ffffe101`3ffff5d0 fffff808`48526117 FLTMGR!FltpPassThroughInternal+0x8c
0b ffffe101`3ffff600 fffff801`92125200 FLTMGR!FltpCreate+0x2d7
0c ffffe101`3ffff6b0 fffff801`9213058b nt!IopParseDevice+0x7f0
0d ffffe101`3ffff880 fffff801`921340c0 nt!ObpLookupObjectName+0x46b
0e ffffe101`3ffffa50 fffff801`920c9e90 nt!ObOpenObjectByNameEx+0x1e0

0: kd> dt nt!_FILE_OBJECT ffff948c621a3330
   +0x000 Type             : 0n5
   +0x002 Size             : 0n216
   +0x008 DeviceObject     : 0xffff948c`60a3bc80 _DEVICE_OBJECT
   +0x010 Vpb              : 0xffff948c`60a556e0 _VPB
   +0x018 FsContext        : 0xffff948c`6111a740 Void
   +0x020 FsContext2       : 0xffff8483`76cf78f0 Void
   +0x028 SectionObjectPointer : (null) 
   +0x030 PrivateCacheMap  : (null) 
   +0x038 FinalStatus      : 0n0
   +0x040 RelatedFileObject : (null) 
   +0x048 LockOperation    : 0 ''
   +0x049 DeletePending    : 0 ''
   +0x04a ReadAccess       : 0x1 ''
   +0x04b WriteAccess      : 0 ''
   +0x04c DeleteAccess     : 0 ''
   +0x04d SharedRead       : 0x1 ''
   +0x04e SharedWrite      : 0x1 ''
   +0x04f SharedDelete     : 0x1 ''
   +0x050 Flags            : 0x40000
   +0x058 FileName         : _UNICODE_STRING "\"
   +0x068 CurrentByteOffset : _LARGE_INTEGER 0x0
   +0x070 Waiters          : 0
   +0x074 Busy             : 0
   +0x078 LastLock         : (null) 
   +0x080 Lock             : _KEVENT
   +0x098 Event            : _KEVENT
   +0x0b0 CompletionContext : (null) 
   +0x0b8 IrpListLock      : 0
   +0x0c0 IrpList          : _LIST_ENTRY [ 0xffff948c`621a33f0 - 0xffff948c`621a33f0 ]
   +0x0d0 FileObjectExtension : 0xffff948c`6226f1b0 Void
   
0: kd> dq 0xffff948c`6226f1b0
ffff948c`6226f1b0  00000000`00000000 00000000`00000000
ffff948c`6226f1c0  ffff948c`60dff0d0 00000000`00000000
ffff948c`6226f1d0  ffff948c`6243f2c0 00000000`00000000
ffff948c`6226f1e0  00000000`00000000 00000000`00000000
ffff948c`6226f1f0  00000000`00000000 00000000`00000000
ffff948c`6226f200  61436d4d`02120006 00000000`0000034c
ffff948c`6226f210  ffff8483`7535ed10 ffff948c`62161e28
ffff948c`6226f220  ffff948c`6221da78 00000000`00000000

0: kd> dq ffff948c`60dff0d0
ffff948c`60dff0d0  ffff948c`610734a0 00000000`00000000
ffff948c`60dff0e0  00000000`00000000 00000000`00000000
ffff948c`60dff0f0  65536d4d`02060003 6c8da38a`069a7123
ffff948c`60dff100  00000000`00000000 0000024e`49c8000a
ffff948c`60dff110  0000024e`49c80fff 00000000`00000000
ffff948c`60dff120  00000000`00000000 00000000`00000000
ffff948c`60dff130  00000000`00000000 00000000`00000000
ffff948c`60dff140  00000000`00000002 00000000`00000000

0: kd> !object ffff948c`610734a0
Object: ffff948c610734a0  Type: (ffff948c5e34eb00) Device
    ObjectHeader: ffff948c61073470 (new version)
    HandleCount: 0  PointerCount: 1
    
0: kd> !devstack ffff948c610734a0
  !DevObj           !DrvObj            !DevExt           ObjectName
> ffff948c610734a0  \FileSystem\FltMgr ffff948c610735f0  
  ffff948c61048060  \FileSystem\fastfatffff948c610481b0 
  
0: kd> !vpb 0xffff948c`60a556e0
Vpb at 0xffff948c60a556e0
Flags: 0x1 mounted 
DeviceObject: 0xffff948c61048060
RealDevice:   0xffff948c60a3bc80
RefCount: 8
Volume Label: 

Tuesday, June 27, 2017

Sunday, June 25, 2017

Magenta RISC-V booting

Below is an output of Magenta RISC-V port riscv-magenta

[1266874889.709] 00000.00000> Available physical memory: 2032dMB
[1266874889.709] 00000.00000> 
welcome to lk/MP

[1266874889.709] 00000.00000> INIT: cpu 0, calling hook 0xffffffff8006e2d4 (global_prng_seed) at level 0x30000, flags 0x1
[1266874889.709] 00000.00000> WARNING: System has insufficient randomness.  It is completely unsafe to use this system for any cryptographic applications.
[1266874889.709] 00000.00000> INIT: cpu 0, calling hook 0xffffffff800386b0 (elf_build_id) at level 0x3fffe, flags 0x1
[1266874889.709] 00000.00000> INIT: cpu 0, calling hook 0xffffffff800385f0 (version) at level 0x3ffff, flags 0x1
[1266874889.709] 00000.00000> version:
[1266874889.709] 00000.00000>  arch:     RISCV
[1266874889.709] 00000.00000>  platform: RISCV_RV64
[1266874889.709] 00000.00000>  target:   QEMU_RISCV_RV64
[1266874889.709] 00000.00000>  project:  MAGENTA_QEMU_RISCV_RV64
[1266874889.709] 00000.00000>  buildid:  GIT_0FBB50D3A8FC1D242E1D7A2921674579C9192D66_DIRTY_LOCAL
[1266874889.709] 00000.00000>  ELF build ID: be2909891fc4cce74084f5fa3f14d6e98903d3bb
[1266874889.709] 00000.00000> INIT: cpu 0, calling hook 0xffffffff80042f0c (vm_preheap) at level 0x3ffff, flags 0x1
[1266874889.709] 00000.00000> initializing heap
[1266874889.709] 00000.00000> INIT: cpu 0, calling hook 0xffffffff8004305c (vm) at level 0x50000, flags 0x1
[1266874889.709] 00000.00000> VM: reserving kernel region [ffffffff80000000, ffffffff800f7000) flags 0x28 name 'kernel_code'
[1266874889.709] 00000.00000> VM: reserving kernel region [ffffffff800f7000, ffffffff80139000) flags 0x8 name 'kernel_rodata'
[1266874889.709] 00000.00000> VM: reserving kernel region [ffffffff80139000, ffffffff8013c000) flags 0x18 name 'kernel_data'
[1266874889.709] 00000.00000> VM: reserving kernel region [ffffffff8013c000, ffffffff80166000) flags 0x18 name 'kernel_bss'
[1266874889.709] 00000.00000> VM: reserving kernel region [ffffffff8016a000, ffffffff8114b000) flags 0x18 name 'kernel_bootalloc'
[1266874889.709] 00000.00000> INIT: cpu 0, calling hook 0xffffffff8000197c (timer) at level 0x50003, flags 0x1
[00003.966] 00000.00000> initializing mp
[00003.966] 00000.00000> initializing threads
[00003.967] 00000.00000> initializing timers
[00003.967] 00000.00000> INIT: cpu 0, calling hook 0xffffffff800132d4 (debuglog) at level 0x6ffff, flags 0x1
[00003.970] 00000.00000> INIT: cpu 0, calling hook 0xffffffff8006e448 (global_prng_thread_safe) at level 0x6ffff, flags 0x1
[00003.971] 00000.00000> creating bootstrap completion thread
[00004.057] 00000.00000> top of bootstrap2()
[00004.058] 00000.00000> INIT: cpu 0, calling hook 0xffffffff80070c9c (dpc) at level 0x70000, flags 0x1
[00004.064] 00000.00000> INIT: cpu 0, calling hook 0xffffffff8009158c (magenta) at level 0x70000, flags 0x1
[00004.078] 00000.00000> initializing platform
[00004.078] 00000.00000> initializing target
[00004.078] 00000.00000> calling apps_init()
[00004.078] 00000.00000> INIT: cpu 0, calling hook 0xffffffff80014464 (ktrace) at level 0xaffff, flags 0x1
[00005.091] 00000.00000> ktrace: buffer at 0xffffffc000e01000 (33554432 bytes)
[00005.094] 00000.00000> INIT: cpu 0, calling hook 0xffffffff800384bc (userboot) at level 0xaffff, flags 0x1
[00005.161] 00000.00000> userboot: userboot rodata       0 @ [0x292f043000,0x292f045000)
[00005.163] 00000.00000> userboot: userboot code    0x2000 @ [0x292f045000,0x292f05a000)
[00005.165] 00000.00000> userboot: vdso/full rodata       0 @ [0x292f05a000,0x292f05f000)
[00005.167] 00000.00000> userboot: vdso/full code    0x5000 @ [0x292f05f000,0x292f061000)
[00005.193] 00000.00000> userboot: entry point             @ 0x292f046da0
[00005.293] 00000.00000> starting app shell
] [00005.296] 00000.00000> entering main console loop

Monday, June 19, 2017

RISC-V Magenta context switch


The code related to RISC-V Magenta context switching can be found in exception.S  and switch_to.h files.
A thread for a CPU is switched by a call to
static inline void riscv_switch_to(struct riscv_thread_state * prev,
                                   struct riscv_thread_state * next)
{
    __switch_to_aux(prev, next);
    __switch_to(prev, next);
    assert(get_current());
}
The registers are saved and restored from the riscv_thread_state structure.
/*
 * Integer register context switch
 * The callee-saved registers must be saved and restored.
 * 
 *   a0: previous task_struct (must be preserved across the switch)
 *   a1: next task_struct
 */
.section .text
FUNCTION(__switch_to)
    /*
    * $a0 == &prev->arch.stat
    * $a1 == &next->arch.stat
    */
    /* Save context into prev->arch.state */
    REG_S ra,  THREAD_RA(a0)
    REG_S sp,  THREAD_SP(a0)
    REG_S s0,  THREAD_S0(a0)
    REG_S s1,  THREAD_S1(a0)
    REG_S s2,  THREAD_S2(a0)
    REG_S s3,  THREAD_S3(a0)
    REG_S s4,  THREAD_S4(a0)
    REG_S s5,  THREAD_S5(a0)
    REG_S s6,  THREAD_S6(a0)
    REG_S s7,  THREAD_S7(a0)
    REG_S s8,  THREAD_S8(a0)
    REG_S s9,  THREAD_S9(a0)
    REG_S s10, THREAD_S10(a0)
    REG_S s11, THREAD_S11(a0)
    /* Restore context from next->arch.state */
    REG_L ra,  THREAD_RA(a1)
    REG_L sp,  THREAD_SP(a1)
    REG_L s0,  THREAD_S0(a1)
    REG_L s1,  THREAD_S1(a1)
    REG_L s2,  THREAD_S2(a1)
    REG_L s3,  THREAD_S3(a1)
    REG_L s4,  THREAD_S4(a1)
    REG_L s5,  THREAD_S5(a1)
    REG_L s6,  THREAD_S6(a1)
    REG_L s7,  THREAD_S7(a1)
    REG_L s8,  THREAD_S8(a1)
    REG_L s9,  THREAD_S9(a1)
    REG_L s10, THREAD_S10(a1)
    REG_L s11, THREAD_S11(a1)
    REG_L tp,  THREAD_TI(a1) /* Next thread_info pointer */
    /*return to $ra, the new $sp has been set*/
    ret
END(__switch_to)
The new thread $ra points to the next instruction after a call to __switch_to inside riscv_switch_to
(gdb) p/x newthread->arch.state
$137 = {ra = 0xffffffff80002f3c, sp = 0xffffffff8114cc80, s = {0xffffffff8114ccb0, 0xffffffff800031c4, 0x1, 0xffffffff800347f8, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, fstate = {f = {
      0x0 <repeats 32 times>}, fcsr = 0x0}, ti = 0xffffffff8114ae80}
The ret instruction at the end of __switch_to resumes a thread at this address.
The scheduled out thread state has $ra pointing to same address.
(gdb) p/x oldthread->arch.state
$138 = {ra = 0xffffffff8000b3f8, sp = 0xffffffc002e04000, s = {0x0 <repeats 12 times>}, fstate = {f = {0x0 <repeats 32 times>}, fcsr = 0x0}, ti = 0xffffffff8116a9d8}
The riscv_switch_to disassembled listing is
(gdb) disassem 0xffffffff80002f3c
Dump of assembler code for function riscv_switch_to:
   0xffffffff80002f04 <+0>: addi sp,sp,-48
   0xffffffff80002f08 <+4>: sd ra,40(sp)
   0xffffffff80002f0c <+8>: sd s0,32(sp)
   0xffffffff80002f10 <+12>: sd s1,24(sp)
   0xffffffff80002f14 <+16>: addi s0,sp,48
   0xffffffff80002f18 <+20>: mv s1,ra
   0xffffffff80002f1c <+24>: sd a0,-40(s0)
   0xffffffff80002f20 <+28>: sd a1,-48(s0)
   0xffffffff80002f24 <+32>: ld a1,-48(s0)
   0xffffffff80002f28 <+36>: ld a0,-40(s0)
   0xffffffff80002f2c <+40>: jal ra,0xffffffff80002ee0 <__switch_to_aux>
   0xffffffff80002f30 <+44>: ld a1,-48(s0)
   0xffffffff80002f34 <+48>: ld a0,-40(s0)
   0xffffffff80002f38 <+52>: jal ra,0xffffffff800003a8 <__switch_to>
   0xffffffff80002f3c <+56>: jal ra,0xffffffff80002e70 <get_current>
   0xffffffff80002f40 <+60>: mv a5,a0
   0xffffffff80002f44 <+64>: seqz a5,a5
   0xffffffff80002f48 <+68>: andi a5,a5,255
   0xffffffff80002f4c <+72>: beqz a5,0xffffffff80002f78 <riscv_switch_to+116>
   0xffffffff80002f50 <+76>: mv a0,s1
   0xffffffff80002f54 <+80>: mv a1,s0
   0xffffffff80002f58 <+84>: lui a5,0x800f2
   0xffffffff80002f5c <+88>: addi a5,a5,1880 # 0xffffffff800f2758
   0xffffffff80002f60 <+92>: li a4,34
   0xffffffff80002f64 <+96>: lui a3,0x800f2
   0xffffffff80002f68 <+100>: addi a3,a3,1896 # 0xffffffff800f2768
   0xffffffff80002f6c <+104>: lui a2,0x800f2
   0xffffffff80002f70 <+108>: addi a2,a2,1840 # 0xffffffff800f2730
   0xffffffff80002f74 <+112>: jal ra,0xffffffff8006b9f0 <_panic>
   0xffffffff80002f78 <+116>: nop
   0xffffffff80002f7c <+120>: ld ra,40(sp)
   0xffffffff80002f80 <+124>: ld s0,32(sp)
   0xffffffff80002f84 <+128>: ld s1,24(sp)
   0xffffffff80002f88 <+132>: addi sp,sp,48
   0xffffffff80002f8c <+136>: ret
As you can see 0xffffffff80002f3c is an address of a call to get_current which is a parameter to the assert check after a call to __switch_to.