Build your own utils Library in embedded Projects - Basic space-time cost calculation of different chips

Recently I read the book "Programming Pearls", which focuses on engineering practice and is very well written. The example that impressed me most is the Brooklyn Bridge. Before the 1940s, many suspension bridges collapsed in storms because of aerodynamic lift; modeling this phenomenon mathematically and doing the engineering calculations correctly would have taken about ten years. Yet the Brooklyn Bridge did not collapse. Its engineer, John Roebling, said that he knew aerodynamic lift existed but could not model it correctly, so he designed the bridge to be six times as strong as required. The author therefore suggests derating our performance estimates by a factor of 2, 4 or 6; in other words, always reserve enough performance margin. The author closes by asking, "Are we engineers like John Roebling? I doubt it."
Back to business: the book describes a space-time cost model for measuring the space and time overhead of code, which is handy for performance estimation and code optimization. This article tidies up that code so it can be added to a utils library; it runs on both the ESP32 and the STM32F103C8.

Space occupancy of basic types

/*
 * SIZE_MEASURE(T) - print one line of the form "sizeof(<T>)=<bytes>".
 *
 * #T stringizes the macro argument, so the type name is printed exactly as
 * it is written at the call site.
 *
 * sizeof yields a size_t, which must not be passed to "%d".  The value is
 * cast to unsigned and printed with "%u" so that the conversion specifier
 * and the argument type always agree.
 */
#define SIZE_MEASURE(T) \
do { \
    printf("sizeof(%s)=%u\n", #T, (unsigned)sizeof(T)); \
} while (0)

/*
 * Print the storage size of each basic C type on the current target,
 * one "sizeof(<type>)=<bytes>" line per type, preceded by a title line.
 */
void platform_base_type_size_measure(void)
{
    printf("%s", "\nplatform_base_type_size_measure\n");

    /* Integer types, narrowest first. */
    SIZE_MEASURE(char);
    SIZE_MEASURE(short);
    SIZE_MEASURE(int);
    SIZE_MEASURE(long);
    SIZE_MEASURE(long long);

    /* Floating-point types. */
    SIZE_MEASURE(float);
    SIZE_MEASURE(double);

    /* Object pointer. */
    SIZE_MEASURE(int*);
}

This uses a point covered earlier: prefixing a macro parameter with # (the stringizing operator) turns the argument into a string literal.
result:

platform_base_type_size_measure
sizeof(char)=1
sizeof(short)=2
sizeof(int)=4
sizeof(long)=4
sizeof(long long)=8
sizeof(float)=4
sizeof(double)=8
sizeof(int*)=4

Because both esp32 and stm32 are 32-bit platforms, int is 4 bytes and pointer is 4 bytes.

Structure alignment and malloc footprint

/*
 * Sample structure layouts used to observe padding and allocator overhead.
 * Naming: c = char, i = int, d = double, p = pointer; the letters follow the
 * member order.  structNN holds a char array of NN bytes.
 */
typedef struct { char c; } structc;
typedef struct { int i; char c; } structic;
/*
 * The struct tag is required here: without it, "struct structip *p" would
 * name a separate, never-completed type instead of this structure, so a
 * node could not point to another node of the same type.
 */
typedef struct structip { int i; struct structip *p; } structip;
typedef struct { double d; char c; } structdc;
typedef struct { char c; double d; } structcd;
typedef struct { char c; double d; char cc; } structcdc;
typedef struct { int i; int ii; int iii; } structiii;
typedef struct { int i; int ii; char c; } structiic;
typedef struct { char c[12]; } struct12;
typedef struct { char c[13]; } struct13;
typedef struct { char c[28]; } struct28;
typedef struct { char c[29]; } struct29;

/*
 * SIZE_MALLOC_MEASURE(T) - print sizeof(T) and the spacing between the
 * addresses of consecutive malloc(sizeof(T)) blocks.
 *
 * Output: "<T>", padding, sizeof(T), then 10 address differences (11
 * allocations) separated by spaces, then a newline.  The differences hint at
 * how much memory the allocator really consumes per block.
 *
 * All blocks stay allocated while the differences are taken and are freed
 * before the macro finishes, so repeated use does not leak heap.  Addresses
 * are compared as intptr_t, so nothing is truncated on targets whose
 * pointers are wider than 32 bits.  If malloc fails, the row is cut short
 * with a "(malloc failed)" marker.
 */
#define SIZE_MALLOC_MEASURE(T) \
do { \
    enum { SMM_COUNT = 11 }; \
    void *smm_blocks[SMM_COUNT]; \
    int smm_used = 0; \
    printf("%s%c\t", #T, strlen(#T) < 8 ? '\t' : ' '); \
    printf("%u\t", (unsigned)sizeof(T)); \
    for (; smm_used < SMM_COUNT; smm_used++) { \
        smm_blocks[smm_used] = malloc(sizeof(T)); \
        if (smm_blocks[smm_used] == NULL) { \
            printf(" (malloc failed)"); \
            break; \
        } \
        if (smm_used > 0) { \
            printf(" %ld", (long)((intptr_t)smm_blocks[smm_used] - \
                                  (intptr_t)smm_blocks[smm_used - 1])); \
        } \
    } \
    /* Release only the blocks that were actually obtained. */ \
    while (smm_used > 0) { \
        free(smm_blocks[--smm_used]); \
    } \
    printf("\n"); \
} while (0)

/*
 * Print a table with one row per sample type: the type name, its sizeof,
 * and the address spacing of consecutive malloc() blocks of that size.
 */
void platform_struct_size_measure(void)
{
    printf("%s", "\nstruct\t\tsizeof\t malloc size\n");

    /* Scalar reference row. */
    SIZE_MALLOC_MEASURE(int);

    /* Mixed-member structures. */
    SIZE_MALLOC_MEASURE(structc);
    SIZE_MALLOC_MEASURE(structic);
    SIZE_MALLOC_MEASURE(structip);
    SIZE_MALLOC_MEASURE(structdc);
    SIZE_MALLOC_MEASURE(structcd);
    SIZE_MALLOC_MEASURE(structcdc);
    SIZE_MALLOC_MEASURE(structiii);
    SIZE_MALLOC_MEASURE(structiic);

    /* Plain byte arrays of 12, 13, 28 and 29 bytes. */
    SIZE_MALLOC_MEASURE(struct12);
    SIZE_MALLOC_MEASURE(struct13);
    SIZE_MALLOC_MEASURE(struct28);
    SIZE_MALLOC_MEASURE(struct29);
}

The naming rule is: c = char, i = int, d = double, p = pointer; the letters are concatenated in member order.
result:

struct          sizeof   malloc size
int             4        16 267900 16 16 16 16 16 16 16 16
structc         1        16 16 16 16 16 16 16 16 16 16
structic        8        16 16 16 16 16 16 16 16 16 16
structip        8        16 16 16 16 16 16 16 16 16 16
structdc        16       20 20 20 20 20 524 20 20 20 20
structcd        16       20 20 20 20 20 20 20 20 20 20
structcdc       24       28 28 28 28 28 28 28 28 28 28
structiii       12       16 16 16 16 16 16 16 16 16 16
structiic       12       16 16 16 16 16 16 16 16 16 16
struct12        12       16 16 16 16 16 16 16 16 16 16
struct13        13       20 20 20 20 20 20 20 20 20 20
struct28        28       32 32 32 32 32 32 32 32 32 32
struct29        29       36 36 36 36 36 36 36 36 36 36

As you can see, structures are padded for alignment, and a structure containing a double is padded to a multiple of 8 bytes. So if you map a structure directly onto a protocol frame in communication code, remember to use #pragma pack(1) to force 1-byte alignment.
malloc consumes extra space for every allocation, and the overhead does not necessarily grow linearly; it depends on the malloc implementation.

Integer operator time consumption

/*
 * INT_TIME_MEASURE(OP, n) - time the statement OP inside an n x n loop nest.
 *
 * OP is expanded in the innermost loop and may use the loop counters `i`
 * (outer) and `j` (inner), which both run from 1 to n inclusive.  The sweep
 * is repeated 5 times.  Output: the text of OP, padding, the 5 elapsed times
 * as reported by esp_timer_get_time() (printed under the "time(us)" column),
 * then the average cost of one iteration scaled by 1000 ("time avg(ns)").
 *
 * n is evaluated exactly once and the iteration count is computed in 64
 * bits, so expression arguments and large n behave correctly.  int64_t
 * values are cast to long long to match "%lld" on every platform.  When no
 * iteration runs (n <= 0) the average is printed as 0 instead of dividing
 * by zero.
 */
#define INT_TIME_MEASURE(OP, n) \
do { \
    enum { ITM_ROUNDS = 5 }; \
    const int itm_n = (n); \
    const int64_t itm_iters = (int64_t)itm_n * itm_n * ITM_ROUNDS; \
    int64_t itm_total = 0; \
    printf("%s%c\t", #OP, strlen(#OP) < 8 ? '\t' : ' '); \
    for (int itm_round = 0; itm_round < ITM_ROUNDS; itm_round++) { \
        const int64_t itm_start = esp_timer_get_time(); \
        for (int i = 1; i <= itm_n; i++) { \
            for (int j = 1; j <= itm_n; j++) { \
                OP; \
            } \
        } \
        const int64_t itm_elapsed = esp_timer_get_time() - itm_start; \
        printf("%lld ", (long long)itm_elapsed); \
        itm_total += itm_elapsed; \
    } \
    printf("\t%lld\n", \
           (long long)(itm_iters > 0 ? 1000 * itm_total / itm_iters : 0)); \
} while (0)

/*
 * Benchmark the basic integer operators.  Each operator is run through
 * INT_TIME_MEASURE with a 500 x 500 loop nest and reported as one table row.
 */
void platform_int_time_measure(void)
{
    const int side = 500;   /* loop bound for both the outer and inner loop */
    volatile int k = 0;     /* volatile destination for every measured store */

    printf("\nplatform_int_time_measure (n=%d)\n"
           "oprate\t\ttime(us)\t\ttime avg(ns)\n", side);

    /* Empty body: shows the cost of the loops themselves. */
    INT_TIME_MEASURE({}, side);

    /* Increment and the arithmetic operators. */
    INT_TIME_MEASURE(k++, side);
    INT_TIME_MEASURE(k=i+j, side);
    INT_TIME_MEASURE(k=i-j, side);
    INT_TIME_MEASURE(k=i*j, side);
    INT_TIME_MEASURE(k=i/j, side);
    INT_TIME_MEASURE(k=i%j, side);

    /* Bitwise operators. */
    INT_TIME_MEASURE(k=i&j, side);
    INT_TIME_MEASURE(k=i|j, side);
}

Because this involves timing, a timer function is required; the ESP32 implementation is shown here. The operations run very fast and the timer resolution is limited, so each operation is executed many times and the average is taken.
result:

//ESP32
platform_int_time_measure (n=500)
oprate          time(us)                        time avg(ns)
{}              1 1 1 1 1       0
k++             11473 11475 11471 11471 11471   45
k=i+j           7306 7305 7301 7305 7305        29
k=i-j           7302 7305 7304 7305 7301        29
k=i*j           9390 9390 9390 9390 9390        37
k=i/j           9621 9620 9620 9621 9621        38
k=i%j           10664 10663 10662 10662 10662   42
k=i&j           8347 8346 8346 8346 8343        33
k=i|j           8347 8346 8347 8346 8346        33

//STM32F103C8
platform_int_time_measure (n=500)
oprate        time(us)                         time avg(ns)
{}            27290 27290 27280 27280 27290     109
k++           39150 39150 39150 39150 39150     156
k=i+j        39060 39060 39070 39070 39070     156
k=i-j        35110 35110 35110 35110 35100     140
k=i*j        35180 35180 35180 35180 35180     140
k=i/j        44410 44410 44410 44410 44410     177
k=i%j        63790 63790 63790 63790 63790     255
k=i&j        39060 39070 39070 39060 39060     156
k=i|j        46820 46820 46820 46820 46830     187

On the ESP32 the increment operation is strangely slow, while on the STM32 increment and addition cost the same.
Division is not noticeably more expensive on the ESP32, but it is on the STM32.
What is easy to overlook is that the remainder (%) operation is very time-consuming, and the bitwise AND operation saves no time either.

Floating point operator time consumption

/*
 * FLOAT_TIME_MEASURE(OP, n) - time the statement OP inside an n x n loop
 * nest with float operands available.
 *
 * OP is expanded in the innermost loop and may use `fi` and `fj` (volatile
 * floats holding the current outer/inner counter values) as well as the
 * integer counters `i` and `j`, which run from 1 to n inclusive.  The sweep
 * is repeated 5 times.  Output: the text of OP, padding, the 5 elapsed times
 * as reported by esp_timer_get_time(), then the average cost of one
 * iteration scaled by 1000.  The int-to-float stores are part of the timed
 * loop, so an empty OP measures that baseline.
 *
 * n is evaluated exactly once and the iteration count is computed in 64
 * bits.  int64_t values are cast to long long to match "%lld" on every
 * platform.  When no iteration runs (n <= 0) the average is printed as 0
 * instead of dividing by zero.
 */
#define FLOAT_TIME_MEASURE(OP, n) \
do { \
    enum { FTM_ROUNDS = 5 }; \
    const int ftm_n = (n); \
    const int64_t ftm_iters = (int64_t)ftm_n * ftm_n * FTM_ROUNDS; \
    int64_t ftm_total = 0; \
    volatile float fi = 0, fj = 0; \
    printf("%s%c\t", #OP, strlen(#OP) < 8 ? '\t' : ' '); \
    for (int ftm_round = 0; ftm_round < FTM_ROUNDS; ftm_round++) { \
        const int64_t ftm_start = esp_timer_get_time(); \
        for (int i = 1; i <= ftm_n; i++) { \
            fi = (float)i; \
            for (int j = 1; j <= ftm_n; j++) { \
                fj = (float)j; \
                OP; \
            } \
        } \
        const int64_t ftm_elapsed = esp_timer_get_time() - ftm_start; \
        printf("%lld ", (long long)ftm_elapsed); \
        ftm_total += ftm_elapsed; \
    } \
    printf("\t%lld\n", \
           (long long)(ftm_iters > 0 ? 1000 * ftm_total / ftm_iters : 0)); \
} while (0)

/*
 * Benchmark the four basic float operators.  Each one is run through
 * FLOAT_TIME_MEASURE with a 100 x 100 loop nest and reported as a table row.
 */
void platform_float_time_measure(void)
{
    const int side = 100;    /* loop bound for both the outer and inner loop */
    volatile float fk = 0;   /* volatile destination for every measured store */

    printf("\nplatform_float_time_measure (n=%d)\n"
           "oprate\t\ttime(us)\t\ttime avg(ns)\n", side);

    /* Empty body: loop plus int-to-float store overhead only. */
    FLOAT_TIME_MEASURE({}, side);

    FLOAT_TIME_MEASURE(fk=fi+fj, side);
    FLOAT_TIME_MEASURE(fk=fi-fj, side);
    FLOAT_TIME_MEASURE(fk=fi*fj, side);
    FLOAT_TIME_MEASURE(fk=fi/fj, side);
}

result:

//ESP32
platform_float_time_measure (n=100)
oprate          time(us)                        time avg(ns)
{}              1685 1683 1683 1683 1683        168
fk=fi+fj        3941 3935 3939 3936 3935        393
fk=fi-fj        4516 4511 4514 4510 4514        451
fk=fi*fj        4293 4291 4295 4291 4295        429
fk=fi/fj        14036 14037 14034 14033 14037   1403

//STM32
platform_float_time_measure (n=100)
oprate        time(us)                         time avg(ns)
{}            6000 6010 6000 6000 6010     600
fk=fi+fj     15670 15680 15680 15680 15680     1567
fk=fi-fj     14760 14760 14760 14760 14760     1476
fk=fi*fj     13920 13920 13920 13920 13920     1392
fk=fi/fj     20790 20790 20790 20790 20790     2079

In this test each integer has to be converted to floating point first, which itself takes considerable time, so the empty-run ({}) time should be subtracted from the results. Even on the ESP32, which has an FPU, floating-point operations are still expensive.

Array operator time consumption

/*
 * Benchmark integer addition with zero, one or two array operands, using
 * INT_TIME_MEASURE with a 500 x 500 loop nest.
 *
 * The array has loop_n + 1 elements because the loop counters used as
 * indices run from 1 to loop_n inclusive.  It is filled before timing so
 * that the measured loads never read uninitialized storage.
 */
void platform_array_time_measure(void)
{
    enum { LOOP_N = 500 };
    const int loop_n = LOOP_N;
    printf("\nplatform_array_time_measure (n=%d)\n", loop_n);
    printf("oprate\t\ttime(us)\t\ttime avg(ns)\n");
    volatile int k = 0;
    int x[LOOP_N + 1];
    for (int idx = 0; idx <= LOOP_N; idx++) {
        x[idx] = idx;
    }
    INT_TIME_MEASURE(k=i+j, loop_n);
    INT_TIME_MEASURE(k=x[i]+j, loop_n);
    INT_TIME_MEASURE(k=i+x[j], loop_n);
    INT_TIME_MEASURE(k=x[i]+x[j], loop_n);
}

This reuses the integer macro directly; the results differ little from those of the plain integer operations.

platform_array_time_measure (n=500)
oprate          time(us)                        time avg(ns)
k=i+j           7302 7304 7304 7301 7304        29
k=x[i]+j        7313 7313 7309 7312 7313        29
k=i+x[j]        10431 10429 10430 10430 10430   41
k=x[i]+x[j]     10432 10432 10432 10432 10432   41

Operator and math library time consumption

/*
 * Benchmark a conditional expression, rand(), sqrt() and sin(), each with a
 * 100 x 100 loop nest, and report one table row per operation.
 */
void platform_function_time_measure(void)
{
    const int side = 100;    /* loop bound for both the outer and inner loop */
    volatile int k = 0;      /* destination for the integer measurement */
    volatile float fk = 0;   /* destination for the math-library measurements */

    printf("\nplatform_function_time_measure (n=%d)\n"
           "oprate\t\ttime(us)\t\ttime avg(ns)\n", side);

    /* Integer-side measurements. */
    INT_TIME_MEASURE(k=(i>j)?i:j, side);
    INT_TIME_MEASURE(rand(), side);

    /* Math-library calls. */
    FLOAT_TIME_MEASURE(fk=sqrt(j+fi), side);
    FLOAT_TIME_MEASURE(fk=sin(j+fi), side);

    /*
     * Further candidates, currently disabled:
     *   FLOAT_TIME_MEASURE(fk=sinh(j+fi), side);
     *   FLOAT_TIME_MEASURE(fk=asin(j+fi), side);
     *   FLOAT_TIME_MEASURE(fk=cos(j+fi), side);
     *   FLOAT_TIME_MEASURE(fk=tan(j+fi), side);
     */
}

result:

platform_function_time_measure (n=100)
oprate          time(us)                        time avg(ns)
k=(i>j)?i:j     337 336 336 336 336             33
rand()          3763 3753 3756 3752 3756        375
fk=sqrt(j+fi)   58442 58416 58417 58419 58416   5842
fk=sin(j+fi)    111011 110967 110970 110967 110971      11097

The square root is not as time-consuming as expected, but the trigonometric functions really are. If high accuracy is not required, as for a breathing LED, a fast approximate trigonometric function is strongly recommended.

malloc time consumption

/*
 * ALLOC_TIME_MEASURE(OP, n) - time n consecutive executions of OP.
 *
 * OP is expanded inside a single loop and may use the counter `i`, which
 * runs from 1 to n inclusive.  Output: the text of OP, padding, the total
 * elapsed time as reported by esp_timer_get_time(), then the average cost
 * of one execution scaled by 1000.
 *
 * n is evaluated exactly once.  int64_t values are cast to long long to
 * match "%lld" on every platform.  When no iteration runs (n <= 0) the
 * average is printed as 0 instead of dividing by zero.
 */
#define ALLOC_TIME_MEASURE(OP, n) \
do { \
    const int atm_n = (n); \
    printf("%s%c\t", #OP, strlen(#OP) < 8 ? '\t' : ' '); \
    const int64_t atm_start = esp_timer_get_time(); \
    for (int i = 1; i <= atm_n; i++) { \
        OP; \
    } \
    const int64_t atm_elapsed = esp_timer_get_time() - atm_start; \
    printf("%lld ", (long long)atm_elapsed); \
    printf("\t\t%lld\n", \
           (long long)(atm_n > 0 ? 1000 * atm_elapsed / atm_n : 0)); \
} while (0)

/*
 * ALLOC_FREE_MEASURE(OP, n) - time n consecutive executions of OP.
 *
 * Same measurement as ALLOC_TIME_MEASURE; the only difference is one extra
 * tab after the operation text so that short texts such as "free(k[i])"
 * line up with the longer malloc rows.  OP may use the counter `i`, which
 * runs from 1 to n inclusive.
 *
 * n is evaluated exactly once.  int64_t values are cast to long long to
 * match "%lld" on every platform.  When no iteration runs (n <= 0) the
 * average is printed as 0 instead of dividing by zero.
 */
#define ALLOC_FREE_MEASURE(OP, n) \
do { \
    const int afm_n = (n); \
    printf("%s%c\t\t", #OP, strlen(#OP) < 8 ? '\t' : ' '); \
    const int64_t afm_start = esp_timer_get_time(); \
    for (int i = 1; i <= afm_n; i++) { \
        OP; \
    } \
    const int64_t afm_elapsed = esp_timer_get_time() - afm_start; \
    printf("%lld ", (long long)afm_elapsed); \
    printf("\t\t%lld\n", \
           (long long)(afm_n > 0 ? 1000 * afm_elapsed / afm_n : 0)); \
} while (0)

/*
 * Benchmark malloc() and free() for block sizes of 16, 100 and 2000 bytes,
 * 100 allocations per size, and report one table row per step.
 *
 * The slot array has loop_n + 1 entries because the measurement loops index
 * it with `i` running from 1 to loop_n inclusive.  The slots themselves are
 * volatile pointers (void *volatile), so every store and load of a slot is
 * a real memory access, while the value handed to free() is a plain void *
 * and no qualifier is discarded.
 */
void platform_alloc_time_measure(void)
{
    enum { LOOP_N = 100 };
    const int loop_n = LOOP_N;
    printf("\nplatform_alloc_time_measure (n=%d)\n", loop_n);
    printf("oprate\t\t\ttime(us)\ttime avg(ns)\n");
    void *volatile k[LOOP_N + 1];
    /* Each malloc pass is followed by a free pass releasing the same slots. */
    ALLOC_TIME_MEASURE(k[i]=malloc(16), loop_n);
    ALLOC_FREE_MEASURE(free(k[i]), loop_n);
    ALLOC_TIME_MEASURE(k[i]=malloc(100), loop_n);
    ALLOC_FREE_MEASURE(free(k[i]), loop_n);
    ALLOC_TIME_MEASURE(k[i]=malloc(2000), loop_n);
    ALLOC_FREE_MEASURE(free(k[i]), loop_n);
}

result:

platform_alloc_time_measure (n=100)
oprate                  time(us)        time avg(ns)
k[i]=malloc(16)         317             3170
free(k[i])              195             1950
k[i]=malloc(100)        317             3170
free(k[i])              194             1940
k[i]=malloc(2000)       613             6130
free(k[i])              189             1890

The time taken by free is relatively stable. The time taken by malloc increases with the allocation size, but not linearly. Neither cost can be ignored.

End

These comparisons show that malloc and free, which were previously overlooked, are major consumers of both time and space. The remainder (modulo) operation also deserves attention.
With these comparisons of space and time consumption, I believe you can have a further understanding of the platform you use. At the same time, when you need to optimize the code in the future, you can also have a clear direction.

Keywords: C Embedded system performance

Added by rickaclark on Mon, 17 Jan 2022 11:06:43 +0200