From 330edbe8eb36bee43cfb3408317eda14b6674a81 Mon Sep 17 00:00:00 2001 From: Ciro Santilli Date: Thu, 5 Apr 2018 08:52:30 +0100 Subject: [PATCH] Reorganize the benchmark section. Automate Linux kernel measures and move them into that new section. --- README.adoc | 298 ++++++++++++++++++------------ common | 11 +- eeval | 4 + gem5-bench-cache | 63 +++++++ gem5-stat | 25 +++ parsec-benchmark/parsec-benchmark | 2 +- 6 files changed, 277 insertions(+), 126 deletions(-) create mode 100755 eeval create mode 100755 gem5-bench-cache create mode 100755 gem5-stat diff --git a/README.adoc b/README.adoc index 8aefe3b1..40772eb4 100644 --- a/README.adoc +++ b/README.adoc @@ -1681,11 +1681,32 @@ Results (boot not excluded): [options="header"] |=== |Commit |Arch |Simulator |Instruction count -|7228f75ac74c896417fb8c5ba3d375a14ed4d36b |arm |QEMU |680k -|7228f75ac74c896417fb8c5ba3d375a14ed4d36b |arm |gem5 AtomicSimpleCPU |160M -|7228f75ac74c896417fb8c5ba3d375a14ed4d36b |arm |gem5 HPI |155M -|7228f75ac74c896417fb8c5ba3d375a14ed4d36b |x86_64 |QEMU |3M -|7228f75ac74c896417fb8c5ba3d375a14ed4d36b |x86_64 |gem5 AtomicSimpleCPU |528M + +|7228f75ac74c896417fb8c5ba3d375a14ed4d36b +|arm +|QEMU +|680k + +|7228f75ac74c896417fb8c5ba3d375a14ed4d36b +|arm +|gem5 AtomicSimpleCPU +|160M + +|7228f75ac74c896417fb8c5ba3d375a14ed4d36b +|arm +|gem5 HPI +|155M + +|7228f75ac74c896417fb8c5ba3d375a14ed4d36b +|x86_64 +|QEMU +|3M + +|7228f75ac74c896417fb8c5ba3d375a14ed4d36b +|x86_64 +|gem5 AtomicSimpleCPU +|528M + |=== QEMU: @@ -2308,10 +2329,10 @@ Using text mode is another workaround if you don't need GUI features. gem5 is a system simulator, much <>: http://gem5.org/ -For the most part, just add the `-g` option to the QEMU commands and everything should magically work: +For the most part, just add the `-g` option to all commands and everything should magically work: .... -./configure -g && ./build -a arm -g && ./run -a arm -g +./configure -g && ./build -a aarch64 -g && ./run -a aarch64 -g .... 
On another shell: @@ -2365,95 +2386,6 @@ This suits chip makers that want to distribute forks with secret IP to their cus + On the other hand, the chip makers tend to upstream less, and the project becomes more crappy in average :-) -==== gem5 vs QEMU performance - -We have benchmarked a Linux kernel boot with the commands: - -.... -# Try to manually hit Ctrl + C as soon as system shutdown message appears. -time ./run -a arm -e 'init=/poweroff.out' -time ./run -a arm -E 'm5 exit' -g -time ./run -a arm -E 'm5 exit' -g -- --caches --cpu-type=HPI -time ./run -a x86_64 -e 'init=/poweroff.out' -time ./run -a x86_64 -e 'init=/poweroff.out' -- -enable-kvm -time ./run -a x86_64 -e 'init=/poweroff.out' -g -.... - -and the results were: - -[options="header"] -|=== -|Arch |Emulator |Subtype |Time |N times slower than QEMU |Instruction count |Commit - -|arm -|QEMU -| -|6 seconds -|1 -| -|da79d6c6cde0fbe5473ce868c9be4771160a003b - -|arm -|gem5 -|AtomicSimpleCPU -|1 minute 40 seconds -|17 -| -|da79d6c6cde0fbe5473ce868c9be4771160a003b - -|arm -|gem5 -|HPI -|10 minutes -|100 -| -|da79d6c6cde0fbe5473ce868c9be4771160a003b - -|aarch64 -|QEMU -| -|1.3 seconds -|1 -|170k -|b6e8a7d1d1cb8a1d10d57aa92ae66cec9bfb2d01 - -|aarch64 -|gem5 -|AtomicSimpleCPU -|1 minute -|43 -|110M -|b6e8a7d1d1cb8a1d10d57aa92ae66cec9bfb2d01 - -|x86_64 -|QEMU -| -|3.8 seconds -|1 -|1.8M -|4cb8a543eeaf7322d2e4493f689735cb5bfd48df - -|x86_64 -|QEMU -|KVM -|1.3 seconds -|0.3 -| -|4cb8a543eeaf7322d2e4493f689735cb5bfd48df - -|x86_64 -|gem5 -|AtomicSimpleCPU -|6 minutes 30 seconds -|102 -|630M -|4cb8a543eeaf7322d2e4493f689735cb5bfd48df -|=== - -tested on the <>. - -One methodology problem is that gem5 and QEMU were run with different kernel configs, due to <>. This could have been improved if we normalized by instruction counts, but we didn't think of that previously. - === gem5 run benchmark OK, this is why we used gem5 in the first place, performance measurements! 
@@ -2463,25 +2395,29 @@ Let's benchmark https://en.wikipedia.org/wiki/Dhrystone[Dhrystone] which Buildro The most flexible way is to do: .... +arch=aarch64 + # Generate a checkpoint after Linux boots. # The boot takes a while, be patient young Padawan. printf 'm5 exit' >readfile.gitignore -./run -a aarch64 -g -E 'm5 checkpoint;m5 readfile > a.sh;sh a.sh' +./run -a "$arch" -g -E 'm5 checkpoint;m5 readfile > a.sh;sh a.sh' # Restore the checkpoint, and run the benchmark with parameter 1.000. # We skip the boot completely, saving time! printf 'm5 resetstats;dhrystone 1000;m5 exit' >readfile.gitignore -./run -a aarch64 -g -- -r 1 -./gem5-ncycles -a aarch64 +./run -a "$arch" -g -- -r 1 +./gem5-stat -a "$arch" # Now with another parameter 10.000. printf 'm5 resetstats;dhrystone 10000;m5 exit' >readfile.gitignore -./run -a aarch64 -g -- -r 1 -./gem5-ncycles -a aarch64 +./run -a "$arch" -g -- -r 1 +./gem5-stat -a "$arch" .... These commands output the approximate number of CPU cycles it took Dhrystone to run. +For more serious tests, you will likely want to automate logging the commands run and their results to files. A good example is link:gem5-bench-cache[]. + A more naive and simpler to understand approach would be a direct: .... @@ -2579,6 +2515,7 @@ But keep in mind that it only affects benchmark performance of the most detailed |ARM |`HPI` |yes + |=== {empty}*: couldn't test because of: @@ -3704,8 +3641,12 @@ I put an `echo f` in `check_bin_arch`, and it just loops forever, does not stop In this section document how fast the build and clone are, and how to investigate them. +This is to give people an idea of what they should expect. + Send a pull request if you try it out on something significantly different. +Ideally, we should set up an automated build server that benchmarks those things continuously for us. + === Find which packages are making the build slow ....
@@ -3736,27 +3677,135 @@ We do our best to reduce the instruction and feature count to the bare minimum n + One possibility we could play with is to build loadable modules instead of built-in modules to reduce runtime, but make it easier to get started with the modules. -=== Benchmark machines +=== Benchmark this repo benchmarks -The build times are calculated after doing link:https://buildroot.org/downloads/manual/manual.html#_offline_builds[`make source`], which downloads the sources, and basically benchmarks the Internet. +==== Benchmark Linux kernel boot -https://stackoverflow.com/questions/47997565/gem5-system-requirements-for-decent-performance/48941793#48941793 +.... +./bench-boot +.... -==== P51 +Output: -Lenovo ThinkPad link:https://www3.lenovo.com/gb/en/laptops/thinkpad/p-series/P51/p/22TP2WPWP51[P51 laptop]: +.... +cmd ./run -a arm -E '/poweroff.out' +time 6.77 +cmd ./run -a arm -E 'm5 exit' -g +time 146.96 +insts 230209017 +cmd ./run -a arm -E 'm5 exit' -g -- --caches --cpu-type=HPI +time > 3600 +insts > 373227765 +cmd ./run -a aarch64 -E '/poweroff.out' +time 1.28 +cmd ./run -a aarch64 -E 'm5 exit' -g +time 57.77 +insts 111512915 +cmd ./run -a aarch64 -E 'm5 exit' -g -- --caches --cpu-type=HPI +time 360.90 +insts 111655309 +cmd ./run -a x86_64 -E '/poweroff.out' +time 3.50 +cmd ./run -a x86_64 -E '/poweroff.out' -- -enable-kvm +time 1.30 +cmd ./run -a x86_64 -E 'm5 exit' -g +time 376.03 +insts 634548425 +.... -* 2500 USD in 2018 (high end) -* Intel Core i7-7820HQ Processor (8MB Cache, up to 3.90GHz) (4 cores 8 threads) -* 32GB(16+16) DDR4 2400MHz SODIMM -* 512GB SSD PCIe TLC OPAL2 -* Ubuntu 17.10 +For `arm` QEMU, we just try to manually hit Ctrl + C as soon as the system shutdown message appears: <>.
+ +The results on the <> were: + +[options="header"] +|=== +|Arch |Emulator |Subtype |Time |N times slower than QEMU |Instruction count |Commit + +|arm +|QEMU +| +|6 seconds +|1 +| +|da79d6c6cde0fbe5473ce868c9be4771160a003b + +|arm +|gem5 +|AtomicSimpleCPU +|1 minute 40 seconds +|17 +| +|da79d6c6cde0fbe5473ce868c9be4771160a003b + +|arm +|gem5 +|HPI +|10 minutes +|100 +| +|da79d6c6cde0fbe5473ce868c9be4771160a003b + +|aarch64 +|QEMU +| +|1.3 seconds +|1 +|170k +|b6e8a7d1d1cb8a1d10d57aa92ae66cec9bfb2d01 + +|aarch64 +|gem5 +|AtomicSimpleCPU +|1 minute +|43 +|110M +|b6e8a7d1d1cb8a1d10d57aa92ae66cec9bfb2d01 + +|aarch64 +|gem5 +|HPI +|6 minutes 4 seconds +| +|534.812.447 +|f482f24f87e2b1814ea9ed74f2c87ab30a4cb019 + +|x86_64 +|QEMU +| +|3.8 seconds +|1 +|1.8M +|4cb8a543eeaf7322d2e4493f689735cb5bfd48df + +|x86_64 +|QEMU +|KVM +|1.3 seconds +|0.3 +| +|4cb8a543eeaf7322d2e4493f689735cb5bfd48df + +|x86_64 +|gem5 +|AtomicSimpleCPU +|6 minutes 30 seconds +|102 +|630M +|4cb8a543eeaf7322d2e4493f689735cb5bfd48df + +|=== + +One methodology problem is that some gem5 and QEMU runs used different kernel configs, due to <>. This could have been improved if we normalized by instruction counts, but to do that we would have to enable tracing, which makes QEMU run much slower. + +==== Benchmark initial build + +The build times are calculated after doing `./configure` and link:https://buildroot.org/downloads/manual/manual.html#_offline_builds[`make source`], which downloads the sources, and basically benchmarks the Internet. Build time at 2c12b21b304178a81c9912817b782ead0286d282: 28 minutes, 15 with full ccache hits. Breakdown: 19% GCC, 13% Linux kernel, 7% uclibc, 6% host-python, 5% host-qemu, 5% host-gdb, 2% host-binutils Single file change on `./build kernel_module-reconfigure`: 7 seconds. -===== P51 baseline benchmarks +==== Benchmark Buildroot build baseline This is the minimal build we could expect to get away with.
@@ -3785,17 +3834,23 @@ This is consistent with the fact that ccache reduces the build time only partial The instructions counts varied very little between the baseline and LKMC, so runtime overhead is not a big deal apparently. -==== P51 gem5 +==== Benchmark gem5 build -How long it takes to build gem5 itself: +How long it takes to build gem5 itself on <> * x86 at 68af229490fc811aebddf68b3e2e09e63a5fa475: 9m40s -==== T430 +=== Benchmark machines -Build time: 2 hours. +==== P51 -TODO specs, SHA. +Lenovo ThinkPad link:https://www3.lenovo.com/gb/en/laptops/thinkpad/p-series/P51/p/22TP2WPWP51[P51 laptop]: + +* 2500 USD in 2018 (high end) +* Intel Core i7-7820HQ Processor (8MB Cache, up to 3.90GHz) (4 cores 8 threads) +* 32GB(16+16) DDR4 2400MHz SODIMM +* 512GB SSD PCIe TLC OPAL2 +* Ubuntu 17.10 === Benchmark Internets @@ -3808,6 +3863,15 @@ TODO specs, SHA. Google M-lab speed test: 36.4Mbps +=== Benchmark this repo bibliography + +gem5: + +* link:https://www.mail-archive.com/gem5-users@gem5.org/msg15262.html[] which parts of the gem5 code make it slow +* what are the minimum system requirements: +** https://stackoverflow.com/questions/47997565/gem5-system-requirements-for-decent-performance/48941793#48941793 +** https://github.com/gem5/gem5/issues/25 + == Conversation === kmod diff --git a/common b/common index 1afc470e..f4066900 100644 --- a/common +++ b/common @@ -1,19 +1,12 @@ #!/usr/bin/env bash -eeval() ( - cmd="$1" - echo "$cmd" | tee -a "${2:-/dev/null}" - eval "$cmd" -) set_common_vars() { arch="$1" gem5="$2" - root_dir="$(pwd)" buildroot_dir="${root_dir}/buildroot" arch_dir="$arch" if "$gem5" && [ ! 
"$arch" = aarch64 ]; then arch_dir="${arch}-gem5" fi - out_dir="${root_dir}/out" out_arch_dir="${out_dir}/${arch_dir}" buildroot_out_dir="${out_arch_dir}/buildroot" build_dir="${buildroot_out_dir}/build" @@ -21,8 +14,10 @@ set_common_vars() { gem5_out_dir="${out_arch_dir}/gem5" m5out_dir="${gem5_out_dir}/m5out" qemu_out_dir="${out_arch_dir}/qemu" - common_dir="${out_dir}/common" } +root_dir="$(pwd)" +out_dir="${root_dir}/out" +common_dir="${out_dir}/common" f=cli.gitignore if [ -f "$f" ]; then . "$f" diff --git a/eeval b/eeval new file mode 100755 index 00000000..7cef72b9 --- /dev/null +++ b/eeval @@ -0,0 +1,4 @@ +#!/usr/bin/env bash +cmd="$1" +echo "$cmd" | tee -a "${2:-/dev/null}" +eval "$cmd" diff --git a/gem5-bench-cache b/gem5-bench-cache new file mode 100755 index 00000000..330fda16 --- /dev/null +++ b/gem5-bench-cache @@ -0,0 +1,63 @@ +#!/usr/bin/env bash +set -eu +. common +while getopts a:gh OPT; do + case "$OPT" in + a) + arch="$OPTARG" + ;; + esac +done +shift "$(($OPTIND - 1))" + +# Vars +set_common_vars "$arch" true +cmd="./run -a $arch -g" +cpt="-E 'm5 checkpoint;m5 readfile > a.sh;sh a.sh'" +cache_small='--caches --l2cache --l1d_size=1024 --l1i_size=1024 --l2_size=1024 --l3_size=1024' +cache_large='--caches --l2cache --l1d_size=1024kB --l1i_size=1024kB --l2_size=1024kB --l3_size=1024kB' +result_file="${gem5_out_dir}/bench-cache.txt" + +bench() ( + cmd="$1" + eeval "$cmd" "$result_file" + ./gem5-ncycles -a "$arch" >> "$result_file" +) + +bench-all() ( + bench "$cmd -- -r 1" + bench "$cmd -- -r 2 $cache_small" + bench "$cmd -- -r 3 $cache_large" + bench "$cmd -- -r 4 $cache_small --cpu-type=HPI" + bench "$cmd -- -r 5 $cache_large --cpu-type=HPI" +) + +# Files. +rm -rf \ + "$result_file" \ + "${m5out_dir}/cpt.*" \ +; + +# Create the checkpoints after the kernel boot. 
+printf 'm5 exit' >readfile.gitignore +eeval "$cmd $cpt" +eeval "$cmd $cpt -- $cache_small" +eeval "$cmd $cpt -- $cache_large" +eeval "$cmd $cpt -- $cache_small --cpu-type=HPI" +eeval "$cmd $cpt -- $cache_large --cpu-type=HPI" + +# dhrystone 1.000 +printf '#!/bin/sh +m5 resetstats +dhrystone 1000 +m5 exit +' >readfile.gitignore +bench-all + +# dhrystone 10.000 +sed -Ei 's/^dhrystone .*/dhrystone 10000/' readfile.gitignore +bench-all + +# dhrystone 100.000 +sed -Ei 's/^dhrystone .*/dhrystone 100000/' readfile.gitignore +bench-all diff --git a/gem5-stat b/gem5-stat new file mode 100755 index 00000000..70ec4199 --- /dev/null +++ b/gem5-stat @@ -0,0 +1,25 @@ +#!/usr/bin/env bash +set -eu +. common +while getopts a:hs: OPT; do + case "$OPT" in + a) + arch="$OPTARG" + ;; + h) + printf "\ +usage: $0 [-a arch] [stat=system.cpu.numCycles] +Get the value for a gem5 stat from the stats.txt file. +" 1>&2 + exit + ;; + esac +done +shift "$(($OPTIND - 1))" +if [ $# -gt 0 ]; then + stat="$1" +else + stat=system.cpu.numCycles +fi +set_common_vars "$arch" true +awk "/^$stat /{ print \$2 }" "${m5out_dir}/stats.txt" diff --git a/parsec-benchmark/parsec-benchmark b/parsec-benchmark/parsec-benchmark index 05c650df..f1b8a70c 160000 --- a/parsec-benchmark/parsec-benchmark +++ b/parsec-benchmark/parsec-benchmark @@ -1 +1 @@ -Subproject commit 05c650df71d6aba890421b23374477abf7a392e8 +Subproject commit f1b8a70c7930fdd150649dfe43f0ea3b27f7937b