ARM mbed OS derives its thread management capabilities from ARM’s RTX realtime operating system.
When a thread crashes due to a stack overflow or other HardFaults, it can be unclear what code is causing the issue. Also unclear is how the threads are initially created, as ARM mbed OS defines its stack sizes via a handful of preprocessor `#define`s and linker `provide`s, which aren’t well documented at all.
Here are some notes on the things I’ve had to figure out by reading the source.
The Crash
The first thing that happened was a crash. One particular piece of code would run, and then RTX would throw the error:
RTX error code: 0x00000001, task ID: 0x200094FC
Digging through the codebase leads to RTX_Conf_CM.c
:
1
2
3
4
5
6
|
/* RTX runtime-error hook: reports the error code together with a thread ID. */
void os_error (uint32_t err_code) {
/* This function is called when a runtime error is detected. Parameter */
/* 'err_code' holds the runtime error code (defined in RTX_Config.h). */
/* Fetch the thread ID that is reported as "task ID" in the message below. */
osThreadId err_task = svcThreadGetId();
/* Produces the "RTX error code: 0x..., task ID: 0x..." line. */
error("RTX error code: 0x%08X, task ID: 0x%08X\n", err_code, err_task);
}
|
Callers of os_error
look like this:
1
2
3
4
5
6
7
8
9
10
|
Usages of Function os_error(uint32_t): [6 occurrences, 2 filtered]
rt_CMSIS.c
1301: os_error(OS_ERR_TIMER_OVF);
rt_List.c
312: os_error (OS_ERR_FIFO_OVF);
rt_Mailbox.c
286: os_error (OS_ERR_MBX_OVF);
rt_System.c
318: os_error (OS_ERR_STK_OVF);
|
If we look at where OS_ERR_STK_OVF
is invoked, it matches the rt_stk_check
function:
1
2
3
4
5
6
|
/* Stack check for the task in os_tsk.run; raises OS_ERR_STK_OVF via os_error(). */
/* Declared __weak, so another definition can replace it at link time. */
__weak void rt_stk_check (void) {
/* Two conditions are treated as an overflow: */
/*  - tsk_stack (presumably the task's saved stack pointer) lies below the */
/*    start of the task's stack buffer ('stack'); */
/*  - the first word of the stack buffer no longer holds MAGIC_WORD. */
if ((os_tsk.run->tsk_stack < (U32)os_tsk.run->stack) ||
(os_tsk.run->stack[0] != MAGIC_WORD)) {
os_error (OS_ERR_STK_OVF);
}
}
|
And the value of OS_ERR_STK_OVF
is:
1 |
#define OS_ERR_STK_OVF 1U |
Now, it was clear that one of the running threads was causing a stack overflow.
Threads
But which piece of code corresponded to the thread?
The task ID is provided by the os_error
crash output. To figure out which code was running at the time, you need to enumerate the threads in the system and list their function entry points.
I added the following functions:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
|
/**
 * Print the ID of the calling thread as "threadId: <pointer>".
 *
 * The ID is obtained from osThreadGetId() and can be matched against the
 * "task ID" value reported by os_error().
 */
static void printThreadId(void)
{
    osThreadId threadId = osThreadGetId();

    /* %p requires a void * argument, so convert the thread handle explicitly. */
    printf("threadId: %p\r\n", (void *)threadId);
}
/**
 * Print one line per thread known to the RTOS.
 *
 * For every thread returned by the enumeration API this prints the thread ID,
 * its entry point, its state, its stack size and its maximum stack usage, as
 * reported by _osThreadGetInfo(). The enumeration handle is released with
 * _osThreadEnumFree() once all threads have been listed.
 */
static void printAllThreadInfo(void)
{
    osThreadEnumId enumId = _osThreadsEnumStart();
    osThreadId threadId = NULL;
    osEvent event;

    /* _osThreadEnumNext() yields a null ID when there are no more threads. */
    while ((threadId = _osThreadEnumNext(enumId)) != NULL)
    {
        /* %p requires a void * argument, so convert the handle explicitly. */
        printf("threadId: %p, ", (void *)threadId);

        event = _osThreadGetInfo(threadId, osThreadInfoEntry);
        printf("entry: %p, ", event.value.p);

        /* The numeric fields are cast so that %lu matches the argument type on
         * every toolchain, not only where the value type is unsigned long. */
        event = _osThreadGetInfo(threadId, osThreadInfoState);
        printf("osThreadInfoState: %lu, ", (unsigned long)event.value.v);

        event = _osThreadGetInfo(threadId, osThreadInfoStackSize);
        printf("osThreadInfoStackSize: %lu, ", (unsigned long)event.value.v);

        event = _osThreadGetInfo(threadId, osThreadInfoStackMax);
        printf("osThreadInfoStackMax: %lu", (unsigned long)event.value.v);

        printf("\r\n");
    }

    _osThreadEnumFree(enumId);
}
|
Now, when I enter main()
, the first thing I see is:
threadId: 0x200094fc
threadId: 0x200094bc, entry: 0x23161, osThreadInfoState: 0, osThreadInfoStackSize: 800, osThreadInfoStackMax: 0
threadId: 0x200094fc, entry: 0x22541, osThreadInfoState: 0, osThreadInfoStackSize: 2048, osThreadInfoStackMax: 0
threadId: 0x200096d8, entry: 0x22589, osThreadInfoState: 0, osThreadInfoStackSize: 512, osThreadInfoStackMax: 0
With the thread entry point address, I can search the ELF file generated by the ARM gcc compiler for corresponding code:
arm-none-eabi-nm.exe mbed5.elf | less
00023160 T osTimerThread
00022540 T pre_main
00022588 T os_idle_demon
The addresses are slightly different (off by one, not sure why), but it’s clear that these are the thread entry functions we want.
Stack Regions
So how do these functions allocate their stack regions?
For the idle thread:
1
2
3
4
5
6
7
8
9
10
|
#define _declare_box8(pool,size,cnt) uint64_t pool[(((size)+7)/8)*(cnt) + 2]
#ifdef __MBED_CMSIS_RTOS_CM
/* Memory pool for os_idle_demon stack allocation. */
_declare_box8 (mp_stk, OS_IDLESTKSIZE*4, 1);
uint32_t const mp_stk_size = sizeof(mp_stk);
#else
rt_init_box (mp_stk, mp_stk_size, BOX_ALIGN_8 | (U16)(os_stackinfo));
rt_init_context (&os_idle_TCB, 0U, os_idle_demon);
|
For the timer thread:
1 |
osThreadDef(osTimerThread, (osPriority)(OS_TIMERPRIO-3), 4*OS_TIMERSTKSZ);
|
For the main thread:
1
2
3
4
5
6
7
8
9
10
11
12
|
// The stack space occupied is mainly dependent on the underlying C standard library
// WORDS_STACK_SIZE is the main thread's stack size in 32-bit words, chosen per toolchain.
#if defined(TOOLCHAIN_GCC) || defined(TOOLCHAIN_ARM_STD) || defined(TOOLCHAIN_IAR)
# define WORDS_STACK_SIZE 512
#elif defined(TOOLCHAIN_ARM_MICRO)
# define WORDS_STACK_SIZE 128
#endif
// Convert the word count to a size in bytes.
#define DEFAULT_STACK_SIZE (WORDS_STACK_SIZE*4)
// Statically allocated stack for the main thread, sized from DEFAULT_STACK_SIZE.
static uint32_t thread_stack_main[DEFAULT_STACK_SIZE / sizeof(uint32_t)];
// Thread definition for the main thread: entry point pre_main, running on thread_stack_main.
osThreadDef_t os_thread_def_main = {(os_pthread)pre_main, osPriorityNormal, 1U, sizeof(thread_stack_main), thread_stack_main};
|
For the interrupt service routines + OS scheduler:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
|
/* Define stack sizes if they haven't been set already */
#if !defined(ISR_STACK_SIZE)
#define ISR_STACK_SIZE ((uint32_t)OS_MAINSTKSIZE * 4)
#endif
/*
* set_stack_heap purpose is to set the following variables:
* -mbed_heap_start
* -mbed_heap_size
* -mbed_stack_isr_start
* -mbed_stack_isr_size
*
* Along with setting up os_thread_def_main
*/
/* Divides the free block described by HEAP_START / HEAP_SIZE between the */
/* interrupt stack and the heap, and records the result in the mbed_* globals. */
void set_stack_heap(void) {
unsigned char *free_start = HEAP_START;
uint32_t free_size = HEAP_SIZE;
#ifdef ISR_STACK_START
/* Interrupt stack explicitly specified */
/* The free block is left untouched in this case, so the heap gets all of it. */
mbed_stack_isr_size = ISR_STACK_SIZE;
mbed_stack_isr_start = ISR_STACK_START;
#else
/* Interrupt stack - reserve space at the end of the free block */
/* The last ISR_STACK_SIZE bytes become the interrupt stack; the free block shrinks by the same amount. */
mbed_stack_isr_size = ISR_STACK_SIZE;
mbed_stack_isr_start = free_start + free_size - mbed_stack_isr_size;
free_size -= mbed_stack_isr_size;
#endif
/* Heap - everything else */
mbed_heap_size = free_size;
mbed_heap_start = free_start;
}
|
Stack Region Defines
The above code defines the regions according to the mbed Memory Model.
+-------------------+   Last Address of RAM
| Scheduler Stack   |
+-------------------+
|                   |   RAM
|                   |
|         ^         |
|         |         |
| Heap Cont..       |
+-------------------+
| app thread n      |
|-------------------|
| app thread 2      |
|-------------------|
| app thread 1      |
|-------------------|
|         ^         |
|         |         |
| Heap              |
+-------------------+
| ZI                |
+-------------------+
| ZI: OS drv stack  |
+-------------------+
| ZI: app thread 3  |
+-------------------+
| ZI: Idle Stack    |
+-------------------+
| ZI: Timer Stack   |
+-------------------+
| ZI: Main Stack    |
+-------------------+
| RW                |
+===================+   First Address of RAM
|                   |
|                   |   Flash
The preprocessor defines controlling these regions are as follows:
Thread | Define | Default | Total Bytes |
---|---|---|---|
idle | OS_IDLESTKSIZE | 128 | x4 = 512 |
timer | OS_TIMERSTKSZ | 200 | x4 = 800 |
main | WORDS_STACK_SIZE | 512 | x4 = 2048 |
isr/scheduler | ISR_STACK_SIZE | 2048 | = 2048 (already in bytes) |
Oddly, it also appears that OS_MAINSTKSIZE isn’t used to size the main thread’s stack: thread_stack_main[] is sized transitively via DEFAULT_STACK_SIZE → WORDS_STACK_SIZE → TOOLCHAIN_GCC, while in the code shown above OS_MAINSTKSIZE only feeds the default ISR_STACK_SIZE.
Printing Runtime Memory Map on Cortex-M / RTX
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
|
#include "rt_TypeDef.h"
extern "C" P_TCB rt_tid2ptcb (osThreadId thread_id);
/**
 * Print the stack region of every thread, followed by the heap and the
 * interrupt (ISR) stack regions.
 *
 * Thread stack bounds come from each thread's control block (tcb->stack and
 * tcb->priv_stack). Heap and ISR stack bounds come from the mbed_heap_* and
 * mbed_stack_isr_* globals.
 */
static void printRuntimeMemoryMap(void)
{
    extern unsigned char * mbed_heap_start;
    extern uint32_t mbed_heap_size;
    extern unsigned char * mbed_stack_isr_start;
    extern uint32_t mbed_stack_isr_size;

    osThreadEnumId enumId = _osThreadsEnumStart();
    osThreadId threadId = 0;

    while ((threadId = _osThreadEnumNext(enumId)))
    {
        P_TCB tcb = rt_tid2ptcb(threadId);
        uint8_t *stackStart = (uint8_t *) tcb->stack;
        /* Widen to unsigned long so the %lu below matches whatever integer
         * type priv_stack is declared with. */
        unsigned long stackSize = (unsigned long) tcb->priv_stack;

        // NOTE: Idle thread seems to be missing its stack size (priv_stack reads
        // as 0 for it), so its stack_end is printed equal to its stack_start.
        printf("threadId: %p, stack_start: %p, stack_end: %p, size: %lu\r\n",
               (void *) threadId,
               (void *) stackStart,
               (void *) (stackStart + stackSize),
               stackSize);
    }
    _osThreadEnumFree(enumId);

    /* %p requires void * arguments and %lu requires unsigned long, hence the casts. */
    printf(" mbed_heap_start: %p, mbed_heap_end: %p, size: %lu\r\n",
           (void *) mbed_heap_start,
           (void *) (mbed_heap_start + mbed_heap_size),
           (unsigned long) mbed_heap_size);
    printf(" mbed_stack_isr_start: %p, mbed_stack_isr_end: %p, size: %lu\r\n",
           (void *) mbed_stack_isr_start,
           (void *) (mbed_stack_isr_start + mbed_stack_isr_size),
           (unsigned long) mbed_stack_isr_size);
}
|
Which prints:
threadId: 0x200094bc, stack_start: 0x20008ed8, stack_end: 0x200091f8, size: 800
threadId: 0x200094fc, stack_start: 0x200066c8, stack_end: 0x20006ec8, size: 2048
threadId: 0x200096d8, stack_start: 0x20009228, stack_end: 0x20009228, size: 0
 mbed_heap_start: 0x2000977c, mbed_heap_end: 0x2000f800, size: 24708
 mbed_stack_isr_start: 0x2000f800, mbed_stack_isr_end: 0x20010000, size: 2048
The Solution
Fixing up the main thread stack to use 4KB instead of 2KB solved the stack overflow.
The hardest part was finding the correct compiler #define
to change.
At runtime, the thread statistics printer shows exactly how much memory over 2KB is used (when built with -DMBED_HEAP_STATS_ENABLED=1 and -DMBED_STACK_STATS_ENABLED=1):
1
2
3
|
threadId: 0x20009cbc, entry: 0x231d5, osThreadInfoState: 0, osThreadInfoStackSize: 800, osThreadInfoStackMax: 112
threadId: 0x20009cfc, entry: 0x225b5, osThreadInfoState: 0, osThreadInfoStackSize: 4096, osThreadInfoStackMax: 2416
threadId: 0x20009ed8, entry: 0x225fd, osThreadInfoState: 0, osThreadInfoStackSize: 512, osThreadInfoStackMax: 72
|
A Better Solution
Instead of hacking the cmsis_os.h
header (which isn’t really portable and would be an eternal out-of-tree patch), the better solution is to run the code in a new thread that has more than 2KB stack.
Under mbed + RTX, you do this:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
|
/* Entry point of the dedicated thread for code that needs a larger stack. */
/* 'args' is presumably the pointer passed to osThreadCreate() (NULL below); it is unused here. */
static void bigtask_main(const void * args)
{
/* Never returns: the work runs inside this endless loop. */
for (;;)
{
// Your code here.
}
}
/* Thread definition: normal priority, stack size 8 * 1024 (third argument). */
/* The sizes printed earlier suggest this argument is in bytes, i.e. an 8 KB stack. */
osThreadDef(bigtask_main, osPriorityNormal, 8 * 1024);
int main(void)
{
/* Start the big-stack thread; NULL means no argument is handed to it. */
/* NOTE(review): the result of osThreadCreate() is not checked, and main() returns */
/* immediately afterwards; confirm how the RTOS treats the main thread after that. */
osThreadCreate(osThread(bigtask_main), NULL);
}
|
“The addresses are slightly different (off by one, not sure why)” It's because of the ARM Thumb instruction set – Thumb code is distinguished by setting the least significant bit of the addresses used for calls (e.g. a thread entry point or a vector).
Thanks for the explanation!
Thanks a bunch and nice work.