Wednesday, 14 November 2012

Scripting MSR Performance Tests With kdb+: Part 2

This post continues the series on performance monitoring with Intel MSRs on Linux using the batch-oriented kernel module to read and write values from and to the MSRs. The previous posts can be found here:
- A Linux Module For Reading/Writing MSRs
- Intel MSR Performance Monitoring Basics
- Fun with MSRs: Counting Performance Events On Intel
- Scripting MSR Performance Tests With kdb+
- Scripting MSR Performance Tests With kdb+: Part 2 (this post ;)
- Intel Performance Monitoring: Loose Ends

This time I'm going to build the shared library used by kdb+ to launch and control the test run. It's fairly simple, since the fiddly work of calculating the values to be written to the IA32_PERFEVTSELx, IA32_FIXED_CTR_CTRL and IA32_PERF_GLOBAL_CTRL MSRs has already been done. What it will do is own the process of stopping, clearing and starting the counters, as well as running a baseline to test the fixed costs of the interaction with the MSR kernel module.

The following q instructs kdb+ to load the function runtest from the shared library. The rules for locating the shared library are fairly simple and documented on the site, but it's probably enough to know that it eventually consults the environment variable LD_LIBRARY_PATH. The command instructs kdb+ to treat the runtest function as taking five arguments. Unlike a compiler, which will check the arity of the function for you, it won't warn you if you get the number of arguments wrong!

/ dynamically load `runtest` from libpmc via 2:, declaring it as a 5-argument function
.pmc.runtestdl:`libpmc 2:(`runtest;5);

The runtest function from libpmc.c is the entry point to the shared library code from kdb+. The parameters and return types are all of type K, which is the wrapper type kdb+ uses for each of its objects. The K (strictly k0) struct contains fields for reference counting, type descriptor, optionally a count value (for vector-types) and then the payload. The function sets up some stack storage before writing the MsrInOut values computed by pmc.q to memory for later execution by the MSR kernel driver. After loading the driver it delegates the test set-up to the run_test_internal function.

Function runtest from libpmc.c

// Entry point invoked from kdb+ (via 2:) to launch and control a test run.
// opv/ecxv/eaxv/edxv: parallel int vectors describing the MsrInOut records
// (op-code, MSR number, low/high payload words) computed by pmc.q.
// testCount: number of test iterations to run.
// Returns the K result list from run_test_internal, or (K)0 on failure.
// NOTE(review): the blog listing lost its braces; they are restored here.
K runtest(K opv, K ecxv, K eaxv, K edxv, K testCount)
{
    struct MsrInOut s_pmc_reset[9];
    struct MsrInOut s_pmc_read[9];
    unsigned long long s_ffc_fixed[FFC_COUNT];
    unsigned long long s_pmc_fixed[PMC_COUNT];
    struct MsrInOut *ptr;
    int i;
    long long count;
    K result;

    // set the global (static) pointers so the start/stop helpers and
    // run_test_internal can reach this invocation's storage.
    // NOTE(review): s_pmc_reset/s_pmc_read are populated elsewhere (not
    // shown in this listing) — confirm against the full source.
    ffc_fixed = s_ffc_fixed;
    pmc_fixed = s_pmc_fixed;
    pmc_reset = s_pmc_reset;
    pmc_read = s_pmc_read;
    ptr = pmc_cfg = (struct MsrInOut*)malloc((opv->n + 1) * sizeof(struct MsrInOut));

    if (pmc_cfg == NULL) {
        return (K)0;
    }

    // record the PMC instructions to memory for batch execution by the driver
    count = opv->n;
    for (i = 0 ; i < count ; i++) {
        wr_msrio(ptr++, kI(opv)[i], kI(ecxv)[i], kI(eaxv)[i], kI(edxv)[i]);
    }

    // open the MSR kernel driver; fd is file-scope so the counter-control
    // helpers can issue ioctls against it.
    // NOTE(review): the open() call was lost in extraction — the device
    // path is reconstructed; confirm against the kernel-module post.
    fd = open("/dev/msrdrv", O_RDWR);
    if (fd == -1) {
        free(pmc_cfg);      // don't leak the config buffer on failure
        return (K)0;
    }
    result = run_test_internal(testCount->i);

    // disable and zero the PMC MSRs
    ioctl(fd, IOCTL_MSR_CMDS, (long long)s_pmc_reset);

    // free the dynamically allocated memory and close the MSR driver
    free(pmc_cfg);
    close(fd);

    return result;
}

The following segment of libpmc.c shows the run_test_internal function as well as the controller functions for starting and stopping the PMC counters. The run_test_internal function zeros the accumulators into which the baseline, fixed-cost values are written, then instantiates the result vectors before delegating execution to the test-harness itself.

Function run_test_internal from libpmc.c

#define FFC_COUNT 3
#define PMC_COUNT 4

extern void execute_baseline(int times, void (start_counters)(void), void (stop_counters)(void));
extern void execute_test(void (start_counters)(void), void (stop_counters)(void));

void start_counters()
    ioctl(fd, IOCTL_MSR_CMDS, (long long)pmc_cfg);

void stop_counters()
    ioctl(fd, IOCTL_MSR_CMDS, (long long)pmc_read);

void start_baseline()
    ioctl(fd, IOCTL_MSR_CMDS, (long long)pmc_cfg);

void stop_baseline()
    ioctl(fd, IOCTL_MSR_CMDS, (long long)pmc_read);

static K run_test_internal(int testCount)
    int i;
    K result, kffc[3], kpmc[4];

    for (i = 0 ; i < PMC_COUNT ; i++)
        pmc_fixed[i] = 0;
    for (i = 0 ; i < FFC_COUNT ; i++)
        ffc_fixed[i] = 0;

    ioctl(fd, IOCTL_MSR_CMDS, (long long)pmc_reset);
    execute_baseline(testCount, &start_baseline, &stop_baseline);
    pmc_fixed[0] = pmc_read[1].value / testCount;
    pmc_fixed[1] = pmc_read[2].value / testCount;
    pmc_fixed[2] = pmc_read[3].value / testCount;
    pmc_fixed[3] = pmc_read[4].value / testCount;
    ffc_fixed[0] = pmc_read[5].value / testCount;
    ffc_fixed[1] = pmc_read[6].value / testCount;
    ffc_fixed[2] = pmc_read[7].value / testCount;

    for (i = 0 ; i < PMC_COUNT ; i++)
        kpmc[i] = ktn(KJ, testCount);

    for (i = 0 ; i < FFC_COUNT ; i++)
        kffc[i] = ktn(KJ, testCount);

    for (i = 1 ; i < 1 + PMC_COUNT + FFC_COUNT ; i++)
        pmc_read[i].value = 0;

    for (i = 0 ; i < testCount ; i++) {
        ioctl(fd, IOCTL_MSR_CMDS, (long long)pmc_reset);
        execute_test(&start_counters, &stop_counters);
        kJ(kpmc[0])[i] = pmc_read[1].value - pmc_fixed[0];
        kJ(kpmc[1])[i] = pmc_read[2].value - pmc_fixed[1];
        kJ(kpmc[2])[i] = pmc_read[3].value - pmc_fixed[2];
        kJ(kpmc[3])[i] = pmc_read[4].value - pmc_fixed[3];
        kJ(kffc[0])[i] = pmc_read[5].value - ffc_fixed[0];
        kJ(kffc[1])[i] = pmc_read[6].value - ffc_fixed[1];
        kJ(kffc[2])[i] = pmc_read[7].value - ffc_fixed[2];
    result = knk(7, kffc[0], kffc[1], kffc[2], kpmc[0], kpmc[1], kpmc[2], kpmc[3]);
    return result;

Getting a representative fixed-cost baseline

The run_test_internal delegates the generation of baseline values for the fixed-costs of starting and stopping the performance counters to an external function. That function should simulate as closely as possible the "shoe-leather" costs associated with invoking the start_baseline and stop_baseline functions. If the code under test is short with no external dependencies on its output, you could get away with providing an implementation of execute_baseline which does no more than the following:

// Minimal baseline: invoke the start/stop pair `times` times with nothing
// in between, so only the fixed cost of the pair itself is measured.
// NOTE(review): braces and loop body were lost in extraction; the
// back-to-back callback invocations match the surrounding article text.
void execute_baseline(int times, void (start_baseline)(void), void (stop_baseline)(void))
{
    int i;
    for (i = 0 ; i < times ; i++) {
        start_baseline();
        stop_baseline();
    }
}
On the other hand, if you have a longer piece of code and you care about preserving the state of the registers, you would need to use an execute_baseline implementation which reflected the stores from register to stack of any data in non-durable registers in the same way the compiler will do for the code under test after the introduction of the calls to start_counters and stop_counters. By "non-durable", I mean data stored in the SSE or AVX registers, as well as general purpose registers not in the set { rbx, rbp, rsp, r12, r13, r14, r15 }.

The whole point about passing function pointers to the test to control the performance monitoring is that it makes it possible to invoke them around the smallest sections of your code - and give or take some jitter measure its performance. For example, I've listed below an entirely hideous Gnu assembler macro which pushes all of the non-durable registers and some of the xmm registers onto the stack, and another to do the reverse - and so my execute_baseline implementation invokes those macros between the calls to start_baseline and stop_baseline. Not rocket science, but something you may only notice if you were looking at your compiler's assembly output. Put another way, introducing the start_counters and stop_counters calls into your code has side-effects which you should take into account.

Example of a slightly elaborate baseline implementation

# Spill xmm0-xmm3 and the general-purpose registers rax, rbx, rcx, rdx,
# rdi, rsi, r8, r9, r12-r15 below the stack pointer, then drop rsp past
# the saved block.
# NOTE(review): the closing .endm was lost in extraction and is restored.
# NOTE(review): movaps requires its stack slots to be 16-byte aligned —
# confirm rsp alignment at every use site of this macro.
.macro m_save_regs
        movaps    %xmm0, -0x10(%rsp)
        movaps    %xmm1, -0x20(%rsp)
        movaps    %xmm2, -0x30(%rsp)
        movaps    %xmm3, -0x40(%rsp)
        movq      %rax,  -0x48(%rsp)
        movq      %rbx,  -0x50(%rsp)
        movq      %rcx,  -0x58(%rsp)
        movq      %rdx,  -0x60(%rsp)
        movq      %rdi,  -0x68(%rsp)
        movq      %rsi,  -0x70(%rsp)
        movq      %r8,   -0x78(%rsp)
        movq      %r9,   -0x80(%rsp)
        movq      %r12,  -0x88(%rsp)
        movq      %r13,  -0x90(%rsp)
        movq      %r14,  -0x98(%rsp)
        movq      %r15,  -0xa0(%rsp)
        sub       $0xa0, %rsp
.endm

# Reverse of m_save_regs: restore rsp, then reload the saved registers
# from the (now released) block below it.
# NOTE(review): the closing .endm was lost in extraction and is restored.
# NOTE(review): after the add, the saved block lies below rsp, and offsets
# past -0x80 are outside the 128-byte red zone — an async signal delivered
# between the add and the loads could clobber the saved values. A safer
# ordering would load at positive offsets before adjusting rsp.
.macro m_restore_regs
        add       $0xa0, %rsp
        movq      -0xa0(%rsp), %r15
        movq      -0x98(%rsp), %r14
        movq      -0x90(%rsp), %r13
        movq      -0x88(%rsp), %r12
        movq      -0x80(%rsp), %r9
        movq      -0x78(%rsp), %r8
        movq      -0x70(%rsp), %rsi
        movq      -0x68(%rsp), %rdi
        movq      -0x60(%rsp), %rdx
        movq      -0x58(%rsp), %rcx
        movq      -0x50(%rsp), %rbx
        movq      -0x48(%rsp), %rax
        movaps    -0x40(%rsp), %xmm3
        movaps    -0x30(%rsp), %xmm2
        movaps    -0x20(%rsp), %xmm1
        movaps    -0x10(%rsp), %xmm0
.endm

.section .text
        .globl     execute_baseline
        .type      execute_baseline, STT_FUNC
# void execute_baseline(
#       int times,
#       void (start_counters)(void),
#       void (stop_counters)(void)
# );
# NOTE(review): the entry label, loop labels and ret were lost in
# extraction and are reconstructed here; the save/restore macros are
# invoked between the two indirect calls as the article text describes.
# NOTE(review): rdi/rsi/rdx are caller-saved in the System V AMD64 ABI —
# if the counter callbacks clobber them this loop breaks; the original
# presumably parked them in callee-saved registers in lines not shown.
execute_baseline:
        push       %rbp
        cmp        $0, %rdi
        je         .LloopEnd
.LloopStart:
        call       *%rsi          # start_counters
        m_save_regs               # simulate compiler spills around the calls
        m_restore_regs
        call       *%rdx          # stop_counters
        sub        $1, %rdi
        jg         .LloopStart
.LloopEnd:
        popq       %rbp
        ret

Example test-harness code

The following is a simple example of how you might implement a test to profile the performance of the gettimeofday function. It really is that simple.

Example test harness

#include <sys/time.h>
#include <stdlib.h>

// Example harness: profile gettimeofday by bracketing the calls with the
// counter-control callbacks supplied by run_test_internal.
// NOTE(review): braces and the callback invocations were lost in
// extraction; their placement around the loop is reconstructed — confirm
// against the original listing.
void execute_test(void (start_counters)(void), void (stop_counters)(void))
{
    struct timeval tv;
    int i;
    start_counters();
    for (i = 1 ; i < 10 ; i++) {
        gettimeofday(&tv, NULL);
    }
    stop_counters();
}
Next time I'll put it all together and run some performance tests...

No comments:

Post a Comment