Part 2: Advanced Static Analysis
Part 2: Advanced Static Analysis
m.c a.c
int e=7; extern int e;
main() .text
a()
main() .text
m.o
int e = 7 .data more system code
system data
int e = 7 .data
a() .text int *ep = &e
int x = 15
a.o int *ep = &e .data int y .bss
int x = 15
.bss .symtab
int y
.debug
Program execution
Operating system provides
Protection and resource allocation
Abstract view of resources (files, system calls)
Virtual memory
Uniform memory space abstraction for each process
Gives the illusion that each process has entire
memory space
How does a program get loaded?
The operating system creates a new process.
Including among other things, a virtual memory space
System loader
Loads the executable file from the file system into the
memory space
Done via DMA (direct memory access)
Executable contains code and statically linked libraries
Executable in file system remains and can be executed again
Loads dynamic shared objects/libraries into memory space
Done via DMA from file system as with original executable
Resolves addresses in code (using .rel.text and .rel.data
information) based on where code/data is loaded
Starts a thread of execution running based on specified
entry point in ELF/PE header
Loading Executable Binaries
Executable object file for
example program p
0
ELF header
Virtual addr
Process image
Program header table
(required for executables) 0x080483e0
init and shared lib
.text section segments
.data section
0x08048494
.bss section .text segment
(r/o)
.symtab
.rel.text 0x0804a010
.data segment
.rel.data (initialized r/w)
.debug
0x0804a3b0
Section header table .bss segment
(required for relocatables) (uninitialized r/w)
Example: Linux virtual memory space (32-bit)
0xffffffff
kernel virtual memory memory
(code, data, heap, stack) invisible to
0xc0000000 user code
user stack
(created at runtime)
%esp (stack pointer)
brk
run-time heap
(managed by malloc)
read/write segment
(.data, .bss)
loaded from the
read-only segment executable file
(.init, .text, .rodata)
0x08048000
unused
0
cat /proc/self/maps
Relocation
Virtual memory abstraction makes compilation and linking easy
Compared to a single, shared real memory address space (e.g. original
Mac)
Linker statically binds all program code and data to absolute virtual
addresses
Linker decides entire memory layout at compile time
Example: DOS/Windows ".com" format is effectively a memory image
Issues
Support dynamic libraries to avoid statically linking things like libc into all
processes.
Dynamic libraries might want to be loaded at the same address!
Need to support relative addressing and relocation again
Want to support address-space layout randomization
Security defense mechanism requiring everything to be relocatable
What Meltdown/Spectre malware might attack first
More on relocation
Relocation in Windows PE (.exe) and Linux ELF
Requires position-independent code
Compiler makes all jumps and branches relative to current
location or relative to a base register set at run-time
Compiler labels any accesses to absolute addresses and has
loader rewrite them to their actual run-time values
Compiler uses indirection and dynamically generated offset
tables to determine addresses
Example: Procedure Link and Global Offset Tables in ELF
GOT contains addresses where imported library calls are loaded at run-
time
Library calls index GOT to determine location to jump to
Note: Can be targeted by malware for hooks!
Program execution
CPU Memory
Addresses
Registers Object Code
E Data Program Data
I OS Data
P Condition Instructions
Codes
%cx
%ecx %ch %cl
%dx
General purpose %edx %dh %dl
registers (mostly)
%bx
%ebx %bh %bl
%esi %si
%edi %di
Scaled Index
movl (%ecx, %edx, 4), %eax
x86 instructions
Rules
Source operand can be memory, register or
constant
Destination can be memory or register
Only one of source and destination can be memory
Source and destination must be same size
What’s the "l" for on the end?
movl 8(%ebp),%eax
It stands for “long” and is 32-bits
Size of the operands
Baggage from the days of 16-bit processors
if statements int x = 1;
int y = 2;
if (x==y)
printf("x equals y.\n");
else
printf("x is not equal to y.\n");
}
080483c4 <f>: int main() { f();}
80483c4: pushl %ebp
80483c5: movl %esp,%ebp
80483c7: subl $0x18,%esp
80483ca: movl $0x1,-0x8(%ebp)
80483d1: movl $0x2,-0x4(%ebp)
80483d8: movl -0x8(%ebp),%eax
80483db: cmpl -0x4(%ebp),%eax
80483de: jne 80483ee <f+0x2a>
80483e0: movl $0x80484f0,(%esp)
80483e7: call 80482d8 <puts@plt>
80483ec: jmp 80483fa <f+0x36>
80483ee: movl $0x80484fc,(%esp)
80483f5: call 80482d8 <puts@plt>
80483fa: leave
80483fb: ret
if statements
Note: Microsoft assembly and reverse operand order
int a = 1, b = 3, c;
if (a > b)
c = a;
else
c = b;
factorial_do:
pushl %ebp
movl %esp, %ebp
movl 8(%ebp), %edx
movl $1, %eax
.L2:
imull %edx, %eax
decl %edx
cmpl $1, %edx
jg .L2
leave
ret
C switch statements
switch (x) {
case 1:
case 5:
code at L0
case 2:
case 3:
code at L1
default:
code at L2
}
C switch statements
Implementation options
Series of conditionals
testl followed by je
OK if few cases and large ranges of values
Slow if many cases
Jump table (example below)
Lookup branch target from a table
Possible with a small range of integer constants
GCC picks implementation based on structure
Example:switch (x) {
case 1: .L3
case 5: .L2
code at L0
case 2:
.L0 1. init jump table at .L3
case 3: .L1 2. get address at .L3+4*x
code at L1 .L1 3. jump to that address
default: .L2
code at L2
} .L0
Example int switch_eg(int x)
{
int result = x;
switch (x) {
case 100:
result *= 13;
break;
case 102:
result += 10;
/* Fall through */
case 103:
result += 11;
break;
case 104:
case 106:
result *= result;
break;
default:
result = 0;
}
return result;
}
int switch_eg(int x)
{
    int result = x;
    switch (x) {
    case 100:
        result *= 13;
        break;
    case 102:
        result += 10;
        /* Fall through */
    case 103:
        result += 11;
        break;
    case 104:
    case 106:
        result *= result;
        break;
    default:
        result = 0;
    }
    return result;
}
    leal -100(%edx),%eax
    cmpl $6,%eax
    ja .L9
    jmp *.L10(,%eax,4)
    .p2align 4,,7
    .section .rodata
    .align 4
    .align 4
.L10:
    .long .L4
    .long .L9
    .long .L5
    .long .L6
    .long .L8
    .long .L9
    .long .L8
    .text
    .p2align 4,,7
.L4:
    leal (%edx,%edx,2),%eax
    leal (%edx,%eax,4),%edx
    jmp .L3
    .p2align 4,,7
.L5:
    addl $10,%edx
.L6:
    addl $11,%edx
    jmp .L3
    .p2align 4,,7
.L8:
    imull %edx,%edx
    jmp .L3
    .p2align 4,,7
.L9:
    xorl %edx,%edx
.L3:
    movl %edx,%eax
    leave
    ret
Key is jump table at .L10
Array of pointers to jump locations
37
Avoiding conditional branches
Modern CPUs with deep pipelines
Instructions fetched far in advance of execution
Mask the latency going to memory
Problem: What if you hit a conditional branch?
Must predict which branch to take and speculatively
fetch/execute!
Branch prediction in CPUs well-studied, fairly
effective (except when it's not… ) (1/2018)
But, best to avoid conditional branching altogether
x86 REP prefixes
Loops require decrement, comparison, and conditional
branch for each iteration
Incur branch prediction penalty and overhead even for trivial
loops
Repeat instruction prefixes (REP, REPE, REPNE)
Inserted just before some instructions (movsb, movsw,
movsd, cmpsb, cmpsw, cmpsd)
REP (repeat for fixed count)
Direction flag (DF) set via cld and std instructions
esi and edi contain pointers to arguments
ecx contains counts
REPE (repeat while equal, i.e. while ZF is set), REPNE (repeat while not equal, i.e. while ZF is clear); both also stop when ecx reaches zero
Used in conjunction with cmpsb, cmpsw, cmpsd
x86 REP example
.data
source DWORD 20 DUP (?)
target DWORD 20 DUP (?)
.code
cld ; clear direction flag = forward
mov ecx, LENGTHOF source
mov esi, OFFSET source
mov edi, OFFSET target
rep movsd
x86 SCAS
Repeat a search until a condition is met
SCASB SCASW SCASD
Search for a specific element in an array
Search for the first element that does not
match a given value
x86 SCAS
.data
.code
mov edi,OFFSET alpha
mov al,'F' ; search for 'F'
mov ecx,LENGTHOF alpha
cld
repne scasb ; repeat while not equal
jnz quit
dec edi ; EDI points to 'F'
x86-64 Conditionals
Conditional instruction execution
cmovXX src, dest
Move value from src to dest if condition XX holds
No branching
Conditional handled as operation within Execution Unit
Added with P6 microarchitecture (PentiumPro onward)
Must ensure gcc compiles with proper target to use
Example (x < y) ? (y) : (x)
movl 8(%ebp),%edx # Get x
movl 12(%ebp),%eax # rval=y
cmpl %edx, %eax # rval:x
cmovll %edx,%eax # If <, rval=x
Performance
14 cycles on all data
More efficient than conditional branching (simple control flow)
But overhead: both branches are evaluated
x86-64 conditional example
Stack Grows
Stack Down
Pointer
%esp
Stack “Top”
IA32 Stack Pushing
Stack “Bottom”
Pushing
pushl Src
Increasing
Decrement %esp by 4 Addresses
Fetch operand at Src
Write operand at
address given by
%esp
e.g. pushl %eax Stack Grows
subl $4, %esp Down
Stack
movl %eax,(%esp) Pointer
%esp -4
Stack “Top”
IA32 Stack Popping
Stack “Bottom”
Popping
popl Dest
Increasing
Read operand at Addresses
address given by
%esp
Write to Dest
Increment %esp by 4
e.g. popl %eax Stack
Stack Grows
Pointer
movl (%esp),%eax %esp Down
+4
addl $4,%esp
Stack “Top”
Stack Operation Examples
Initially pushl %eax popl %edx
call 8048b90
0x110 0x110
0x10c 0x10c
0x108 123 0x108 123
0x104 0x8048553
ret
0x110 0x110
0x10c 0x10c
0x108 123 0x108 123
0x104 0x8048553 0x8048553
increasing addresses
Stack frame is pushed onto program stack stack
stack growth
frame
Upon procedure return
Its frame is popped off of stack who’s
stack
Caller’s stack frame is recovered frame
amI’s
stack
frame
Call chain: foo => who => amI
Keeping track of stack frames
The stack pointer (%esp) moves around
Can be changed within procedure
Problem
How can we consistently find our parameters?
The base pointer (%ebp)
Points to the base of our current stack frame
Also called the frame pointer
Within each function, %ebp stays constant
Most information on the stack is referenced
relative to the base pointer
Base pointer setup is the programmer’s job
Actually usually the compiler’s job
IA32/Linux Stack Frame high addresses
• Resulting
•
• Stack
void swap(int *xp, int *yp)
{
int t0 = *xp; &zip2
int t1 = *yp;
*xp = t1; &zip1
*yp = t0; Rtn adr %esp
}
swap
void swap(int *xp, int *yp)
{
    int t0 = *xp;
    int t1 = *yp;
    *xp = t1;
    *yp = t0;
}
swap:
    # Setup
    pushl %ebp
    movl %esp,%ebp
    pushl %ebx
    # Body
    movl 12(%ebp),%ecx
    movl 8(%ebp),%edx
    movl (%ecx),%eax
    movl (%edx),%ebx
    movl %eax,(%edx)
    movl %ebx,(%ecx)
    # Finish
    movl -4(%ebp),%ebx
    movl %ebp,%esp
    popl %ebp
    ret
swap Setup #1
Entering Resulting
Stack stack
%ebp %ebp
• •
• •
• •
&zip2 yp
&zip1 xp
Rtn adr %esp Rtn adr
Old %ebp %esp
swap:
pushl %ebp
movl %esp,%ebp
pushl %ebx
swap Setup #2
Resulting
Stack before stack
instruction
%ebp
• •
• •
• •
yp yp
xp xp
Rtn adr Rtn adr
Old %ebp %esp Old %ebp %ebp
%esp
swap:
pushl %ebp
movl %esp,%ebp
pushl %ebx
swap Setup #3
Resulting
Stack before Stack
instruction
• •
• •
• •
yp yp
xp xp
Rtn adr Rtn adr
Old %ebp %ebp Old %ebp %ebp
%esp Old %ebx %esp
swap:
pushl %ebp
movl %esp,%ebp
pushl %ebx
Effect of swap Setup
Entering Resulting
Stack Stack
%ebp
• •
• •
• Offset •
(relative to %ebp)
&zip2 12 yp
&zip1 8 xp
Rtn adr %esp 4 Rtn adr
0 Old %ebp %ebp
12 yp 12 yp
8 xp 8 xp
4 Rtn adr 4 Rtn adr
0 Old %ebp %ebp 0 Old %ebp %ebp
-4 Old %ebx %esp -4 Old %ebx %esp
movl -4(%ebp),%ebx
movl %ebp,%esp
popl %ebp
ret
swap Finish #2
swap’s
• •
Stack • •
• •
Offset Offset
12 yp 12 yp
8 xp 8 xp
4 Rtn adr 4 Rtn adr
0 Old %ebp %ebp 0 %ebp
Old %ebp
-4 Old %ebx %esp %esp
movl -4(%ebp),%ebx
movl %ebp,%esp
popl %ebp
ret
swap Finish #3
%ebp
swap’s swap’s
•
Stack
• Stack •
•
• •
Offset Offset
12 12 yp
yp
8 8 xp
xp
4 4 Rtn adr
Rtn adr
%esp
0 Old %ebp %ebp
%esp
movl -4(%ebp),%ebx
movl %ebp,%esp
popl %ebp
ret
swap Finish #4
%ebp
swap’s %ebp
• •
Stack • •
•
Offset
• Exiting
Stack
12 yp &zip2
8 xp &zip1 %esp
4 Rtn adr
%esp
movl -4(%ebp),%ebx
movl %ebp,%esp
popl %ebp
ret
swap void swap(int *xp, int *yp)
{
int t0 = *xp;
int t1 = *yp;
*xp = t1;
*yp = t0;
}
swap:
    # Setup
    pushl %ebp            # Save old %ebp of caller frame
    movl %esp,%ebp        # Set new %ebp for callee (current) frame
    pushl %ebx            # Save state of %ebx register from caller
    # Body
    movl 12(%ebp),%ecx    # Retrieve parameter yp from caller frame
    movl 8(%ebp),%edx     # Retrieve parameter xp from caller frame
    # Perform swap
    movl (%ecx),%eax
    movl (%edx),%ebx
    movl %eax,(%edx)
    movl %ebx,(%ecx)
WinINet API
Bytes over HTTP instead of socket
InternetOpen, InternetOpenUrl,
InternetReadFile
Dynamically-linked libraries (DLLs)
Used in 3 ways by malware
Store malicious code in standard DLL or
custom one
Inject into a process via a LoadLibrary call
Leverage standard Windows DLLs to interact
with OS
Leverage third-party DLLs (e.g. Firefox DLL) to
avoid re-implementing functions
Process functions
Execute code outside of current process
CreateProcess
Listing 7-4, p. 148
Hijack execution of current process
Injecting code via debugger or DLLs
Kill processes
(e.g. anti-virus, Zone Alarm, etc.)
Threading functions
Windows threads share same memory space but
have separate registers and stack
Used by Malware to insert a malicious DLL into a
process's address space
CreateThread with address of LoadLibrary as start
address
Also used to remotely control a process
Two threads created
One takes network input and sends to process stdin (via
WriteFile)
One takes process stdout (via ReadFile) and sends to
network
Listing 7-6, 7-7, 7-8, p. 150-151
Service functions
Service processes run in the background
Scheduled and run by Windows service
manager without user input
Common calls
OpenSCManager, CreateService, StartService
Allows malware to maintain persistence
Types
WIN32_SHARE_PROCESS = service shares a single process with
other services (e.g. hosted in svchost.exe)
WIN32_OWN_PROCESS = independent process
KERNEL_DRIVER = loads code into kernel
COM functions
Microsoft Component Object Model
Interface standard that allows software components
to call each other
OleInitialize, CoInitializeEx to begin use
Navigate function in IWebBrowser2 interface
Used with CoCreateInstance to launch browser