From d4a27987d6d74014d65623f4a2cf9429627066f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ciro=20Santilli=20=E5=85=AD=E5=9B=9B=E4=BA=8B=E4=BB=B6=20?= =?UTF-8?q?=E6=B3=95=E8=BD=AE=E5=8A=9F?= Date: Wed, 18 Nov 2020 00:00:00 +0000 Subject: [PATCH] play a bit with the ARM PMU --- README.adoc | 46 ++++++++++++++++++---- lkmc.c | 11 ++++++ lkmc.h | 6 +++ path_properties.py | 7 +++- userland/cpp/custom_iterator.cpp | 12 ++++++ userland/gcc/busy_loop.c | 15 +------ userland/linux/perf_event_open.c | 67 ++++++++++++++++++++++++++++++++ 7 files changed, 142 insertions(+), 22 deletions(-) create mode 100644 userland/linux/perf_event_open.c diff --git a/README.adoc b/README.adoc index e1c30ced..0fd3f8e3 100644 --- a/README.adoc +++ b/README.adoc @@ -10206,6 +10206,7 @@ TODO exercise DMA on the kernel module. The `edu` hardware model has that featur * https://stackoverflow.com/questions/32592734/are-there-any-dma-driver-example-pcie-and-fpga/44716747#44716747 * https://stackoverflow.com/questions/62831327/add-memory-device-to-qemu * https://stackoverflow.com/questions/64539528/qemu-pci-dma-read-and-pci-dma-write-does-not-work +* https://stackoverflow.com/questions/64842929/general-protection-error-while-tring-to-perform-ioctl ===== Manipulate PCI registers directly @@ -21881,6 +21882,16 @@ and: Due to the way that <> however, the output is more deterministic in that case, see that section for further details. +[[perf-event-open]] +==== `perf_event_open` system call + +link:userland/linux/perf_event_open.c[] counts instructions of a given loop: https://stackoverflow.com/questions/13313510/quick-way-to-count-number-of-instructions-executed-in-a-c-program/64863392#64863392 + +Bibliography: + +* `man perf_event_open` +* https://community.arm.com/developer/ip-products/system/b/embedded-blog/posts/using-the-arm-performance-monitor-unit-pmu-linux-driver + === Linux calling conventions A summary of results is shown at: xref:table-linux-calling-conventions[xrefstyle=full]. 
@@ -22758,15 +22769,10 @@ produces: There is also the RDPID instruction that reads just the processor ID, but it appears to be very new for QEMU 4.0.0 or <>, as it fails with SIGILL on both. -Bibliography: https://stackoverflow.com/questions/22310028/is-there-an-x86-instruction-to-tell-which-core-the-instruction-is-being-run-on/56622112#56622112 +Bibliography: -===== ARM PMCCNTR register - -TODO We didn't manage to find a working ARM analogue to <>: link:kernel_modules/pmccntr.c[] is oopsing, and even it if weren't, it likely won't give the cycle count since boot since it needs to be activate before it starts counting anything: - -* https://stackoverflow.com/questions/40454157/is-there-an-equivalent-instruction-to-rdtsc-in-arm -* https://stackoverflow.com/questions/31620375/arm-cortex-a7-returning-pmccntr-0-in-kernel-mode-and-illegal-instruction-in-u/31649809#31649809 -* https://blog.regehr.org/archives/794 +* ARM has an analogous <> +* https://stackoverflow.com/questions/22310028/is-there-an-x86-instruction-to-tell-which-core-the-instruction-is-being-run-on/56622112#56622112 === x86 thread synchronization primitives @@ -23965,6 +23971,24 @@ Bibliography: * <> +=== ARM PMU + +The PMU (Performance Monitor Unit) is a unit in the ARM CPU that counts performance events of interest. These can be used to benchmark, and sometimes debug, code running on ARM CPUs. + +The <> exposes some (all?) of those events through the arch-agnostic <> system call. + +The PMU is exposed through <>, with registers that start with the prefix `PM*`. 
+ +Bibliography: https://community.arm.com/developer/ip-products/system/b/embedded-blog/posts/using-the-arm-performance-monitor-unit-pmu-linux-driver + +==== ARM PMCCNTR register + +TODO We didn't manage to find a working ARM analogue to <>: link:kernel_modules/pmccntr.c[] is oopsing, and even if it weren't, it likely won't give the cycle count since boot since it needs to be activated before it starts counting anything: + +* https://stackoverflow.com/questions/40454157/is-there-an-equivalent-instruction-to-rdtsc-in-arm +* https://stackoverflow.com/questions/31620375/arm-cortex-a7-returning-pmccntr-0-in-kernel-mode-and-illegal-instruction-in-u/31649809#31649809 +* https://blog.regehr.org/archives/794 + === ARM assembly bibliography ==== ARM non-official bibliography @@ -28171,6 +28195,12 @@ Those files also contain arch specific helpers under ifdefs like: We try to keep as much as possible in those files. It bloats builds a little, but just makes everything simpler to understand. +Link with lkmc.o is enabled with the <> + +.... +'extra_objs_lkmc_common': False, +.... + [[lkmc-home]] ==== lkmc_home diff --git a/lkmc.c b/lkmc.c index 4a83d2f1..7c686155 100644 --- a/lkmc.c +++ b/lkmc.c @@ -57,6 +57,17 @@ void lkmc_assert_memcmp( } } +void __attribute__ ((noinline)) lkmc_busy_loop( + unsigned long long max, + unsigned long long max2 +) { + for (unsigned long long i = 0; i < max2; i++) { + for (unsigned long long j = 0; j < max; j++) { + __asm__ __volatile__ ("" : "+g" (i), "+g" (j) : :); + } + } +} + void lkmc_print_hex_32(uint32_t x) { printf("0x%08" PRIX32, x); } diff --git a/lkmc.h b/lkmc.h index 6ccf8f6c..df2bd399 100644 --- a/lkmc.h +++ b/lkmc.h @@ -34,6 +34,12 @@ void lkmc_assert_memcmp(const void *s1, const void *s2, size_t n, uint32_t line) /* Temporary per C source file name that our examples can safely create. 
*/ #define LKMC_TMP_FILE __FILE__ LKMC_TMP_EXT #define LKMC_TMP_FILE_NAMED(name) __FILE__ "__" name LKMC_TMP_EXT + +/* https://cirosantilli.com/linux-kernel-module-cheat#c-busy-loop */ +void __attribute__ ((noinline)) lkmc_busy_loop( + unsigned long long max, + unsigned long long max2 +); #endif /* Assert that the given branch instruction is taken. */ diff --git a/path_properties.py b/path_properties.py index 0a2e41a0..ff8804a4 100644 --- a/path_properties.py +++ b/path_properties.py @@ -49,6 +49,7 @@ class PathProperties: # added to baremetal examples. 'extra_objs_disable_baremetal_bootloader': False, # We should get rid of this if we ever properly implement dependency graphs. + # Enable: https://cirosantilli.com/linux-kernel-module-cheat#lkmc-c 'extra_objs_lkmc_common': False, 'freestanding': False, 'gem5_unimplemented_instruction': False, @@ -735,7 +736,10 @@ path_properties_tuples = ( 'gcc': ( {**gnu_extension_properties, **{'cc_pedantic': False}}, { - 'busy_loop.c': {'baremetal': True}, + 'busy_loop.c': { + 'baremetal': True, + 'extra_objs_lkmc_common': True, + }, 'openmp.c': {'cc_flags': ['-fopenmp', LF]}, } ), @@ -783,6 +787,7 @@ path_properties_tuples = ( 'gem5_unimplemented_syscall': True }, 'pagemap_dump.c': {'requires_argument': True}, + 'perf_event_open.c': {'extra_objs_lkmc_common': True}, 'poweroff.c': {'requires_sudo': True}, 'proc_events.c': {'requires_sudo': True}, 'proc_events.c': {'requires_sudo': True}, diff --git a/userland/cpp/custom_iterator.cpp b/userland/cpp/custom_iterator.cpp index 7f1e98ca..3087d787 100644 --- a/userland/cpp/custom_iterator.cpp +++ b/userland/cpp/custom_iterator.cpp @@ -28,6 +28,11 @@ class MyMap { auto pair = *it; return std::make_pair(2*pair.first, 3*pair.second); } + // TODO. How to return that new object by address? 
+ //value_type& operator->() { + // auto pair = *it; + // return std::make_pair(2*pair.first, 3*pair.second); + //} }; iterator begin() { return iterator(map.begin()); } iterator end() { return iterator(map.end()); } @@ -44,7 +49,14 @@ int main() { assert((*it++ == std::pair(2, 33))); assert((*it++ == std::pair(4, 36))); + // TODO operator->() + it = map.begin(); + //assert((it->first == 0)); + auto stl_it = map.map.begin(); + assert((stl_it->first == 0)); + for (const auto& v : map) { std::cout << v.first << " " << v.second << std::endl; } + } diff --git a/userland/gcc/busy_loop.c b/userland/gcc/busy_loop.c index 8f0802d0..616274d4 100644 --- a/userland/gcc/busy_loop.c +++ b/userland/gcc/busy_loop.c @@ -2,18 +2,7 @@ * https://cirosantilli.com/linux-kernel-module-cheat#c-busy-loop * https://cirosantilli.com/linux-kernel-module-cheat#benchmark-emulators-on-userland-executables */ -#include - -void __attribute__ ((noinline)) busy_loop( - unsigned long long max, - unsigned long long max2 -) { - for (unsigned long long i = 0; i < max2; i++) { - for (unsigned long long j = 0; j < max; j++) { - __asm__ __volatile__ ("" : "+g" (i), "+g" (j) : :); - } - } -} +#include int main(int argc, char **argv) { unsigned long long max, max2; @@ -27,5 +16,5 @@ int main(int argc, char **argv) { } else { max2 = 1; } - busy_loop(max, max2); + lkmc_busy_loop(max, max2); } diff --git a/userland/linux/perf_event_open.c b/userland/linux/perf_event_open.c new file mode 100644 index 00000000..afad7748 --- /dev/null +++ b/userland/linux/perf_event_open.c @@ -0,0 +1,67 @@ +/* https://cirosantilli.com/linux-kernel-module-cheat#perf-event-open + * + * Count the user-space instructions executed by a busy loop of n iterations, with n given on the command line (default 100). 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +static long +perf_event_open(struct perf_event_attr *hw_event, pid_t pid, + int cpu, int group_fd, unsigned long flags) +{ + int ret; + + ret = syscall(__NR_perf_event_open, hw_event, pid, cpu, + group_fd, flags); + return ret; +} + +int +main(int argc, char **argv) +{ + struct perf_event_attr pe; + long long count; + int fd; + + uint64_t n; + if (argc > 1) { + n = strtoll(argv[1], NULL, 0); + } else { + n = 100; + } + + memset(&pe, 0, sizeof(struct perf_event_attr)); + pe.type = PERF_TYPE_HARDWARE; + pe.size = sizeof(struct perf_event_attr); + pe.config = PERF_COUNT_HW_INSTRUCTIONS; + pe.disabled = 1; + pe.exclude_kernel = 1; + // Don't count hypervisor events. + pe.exclude_hv = 1; + + fd = perf_event_open(&pe, 0, -1, -1, 0); + if (fd == -1) { + fprintf(stderr, "Error opening leader %llx\n", pe.config); + exit(EXIT_FAILURE); + } + + ioctl(fd, PERF_EVENT_IOC_RESET, 0); + ioctl(fd, PERF_EVENT_IOC_ENABLE, 0); + + lkmc_busy_loop(n, 1); + + ioctl(fd, PERF_EVENT_IOC_DISABLE, 0); + read(fd, &count, sizeof(long long)); + + printf("Used %lld instructions\n", count); + + close(fd); +}