Intro
This report presents a performance comparison between two QEMU builds: one compiled with GCC and one with Clang. To provide a variety of test workloads, five new benchmarks are also introduced in the report. For each benchmark, the performance of QEMU is compared across the two builds for seventeen different targets.
The compiler versions used in the report are the default versions available in the Ubuntu 18.04 repositories: 7.5.0 for GCC and 6.0.0 for Clang.
Table of Contents
- Benchmarks Overview
- Setup and Prerequisites
- Performance Tables
- Analysis of Results
- QEMU Binary Size
- Appendix
Benchmarks Overview
This section gives a quick overview of the five new benchmarks used in the report. All benchmarks are available on the project GitHub page.
matmult_double:
Standard matrix multiplication of an n*n matrix of randomly generated double numbers from 0 to 100. The value of n is passed as an argument with the -n flag. The default value is 200.
matmult_int32:
Standard matrix multiplication of an n*n matrix of randomly generated integer numbers from 0 to 100. The value of n is passed as an argument with the -n flag. The default value is 200.
qsort_double:
Quick sort of an array of n randomly generated double numbers from 0 to 1000. The value of n is passed as an argument with the -n flag. The default value is 300000.
qsort_int32:
Quick sort of an array of n randomly generated integer numbers from 0 to 50000000. The value of n is passed as an argument with the -n flag. The default value is 300000.
qsort_string:
Quick sort of an array of 10000 randomly generated strings of size 8 (including the null terminating character). The sort process is repeated n times. The value of n is passed as an argument with the -n flag. The default value is 20.
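As a rough illustration of what these workloads do, the hypothetical C snippet below sketches the core of matmult_double with the default n of 200 hardcoded; it is not the actual benchmark source, which is available on the project GitHub page:
/* Hypothetical sketch of the matmult_double kernel (not the actual
 * benchmark source): multiply two n*n matrices of random doubles
 * in the range 0 to 100. */
#include <stdlib.h>

#define N 200   /* default value of the -n flag */

static double a[N][N], b[N][N], c[N][N];

int main(void)
{
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            a[i][j] = (double)rand() / RAND_MAX * 100.0;
            b[i][j] = (double)rand() / RAND_MAX * 100.0;
        }
    }
    /* Standard triple-loop matrix multiplication. */
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            c[i][j] = 0.0;
            for (int k = 0; k < N; k++) {
                c[i][j] += a[i][k] * b[k][j];
            }
        }
    }
    return 0;
}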
Setup and Prerequisites
All previous reports assumed a GCC build of QEMU 5.0.0. Given that QEMU 5.1.0-rc1 is now released, this report is based on the newer QEMU version.
To download QEMU and create the GCC and Clang builds, you can run the bash snippet below:
wget https://download.qemu.org/qemu-5.1.0-rc1.tar.xz
tar xfv qemu-5.1.0-rc1.tar.xz
cd qemu-5.1.0-rc1
mkdir build-gcc
cd build-gcc
../configure
make
cd ..
mkdir build-clang
cd build-clang
../configure --cc=clang --cxx=clang++
make
The report will assume that the GCC build is at <qemu-gcc-build> and the Clang build is at <qemu-clang-build>.
To measure the performance of the seventeen targets for all five benchmarks and the two QEMU builds, the Python script below is used. The script runs dissect.py (modified to print the number of instructions instead of percentages) on each of the five benchmarks for the two builds. The results are ten tables, two for each benchmark. The script assumes that all five benchmarks are in a benchmarks directory.
import csv
import os
import subprocess

############### Script Options ###############
builds = {
    "gcc": "<qemu-gcc-build>",
    "clang": "<qemu-clang-build>"
}

targets = {
    "aarch64": "aarch64-linux-gnu-gcc",
    "alpha": "alpha-linux-gnu-gcc",
    "arm": "arm-linux-gnueabi-gcc",
    "hppa": "hppa-linux-gnu-gcc",
    "m68k": "m68k-linux-gnu-gcc",
    "mips": "mips-linux-gnu-gcc",
    "mipsel": "mipsel-linux-gnu-gcc",
    "mips64": "mips64-linux-gnuabi64-gcc",
    "mips64el": "mips64el-linux-gnuabi64-gcc",
    "ppc": "powerpc-linux-gnu-gcc",
    "ppc64": "powerpc64-linux-gnu-gcc",
    "ppc64le": "powerpc64le-linux-gnu-gcc",
    "riscv64": "riscv64-linux-gnu-gcc",
    "s390x": "s390x-linux-gnu-gcc",
    "sh4": "sh4-linux-gnu-gcc",
    "sparc64": "sparc64-linux-gnu-gcc",
    "x86_64": "gcc"
}
##############################################

benchmarks = sorted(os.listdir("benchmarks"))
csv_headers = ["Target", "Total Instructions",
               "Code Generation", "JIT Execution", "Helpers"]

for benchmark in benchmarks:
    # Remove file extension
    benchmark_name = os.path.splitext(benchmark)[0]
    for build_name, _ in builds.items():
        with open("tables/{}-{}.csv".format(benchmark_name, build_name), "w") as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(csv_headers)
    for target_name, target_compiler in targets.items():
        compile_target = subprocess.run([target_compiler,
                                         "-O2",
                                         "-static",
                                         "benchmarks/" + benchmark,
                                         "-o",
                                         "/tmp/" + benchmark_name])
        for build_name, build_path in builds.items():
            dissect_target = subprocess.run(["./dissect.py",
                                             "--",
                                             "{}/{}-linux-user/qemu-{}".format(
                                                 build_path, target_name, target_name),
                                             "/tmp/" + benchmark_name],
                                            stdout=subprocess.PIPE)
            # Read the dissect output
            lines = dissect_target.stdout.decode("utf-8").split('\n')
            # Extract measurements
            total_instructions = lines[0].split()[-1]
            code_generation = lines[2].split()[-2]
            jit_execution = lines[3].split()[-2]
            helpers_execution = lines[4].split()[-2]
            # Save output to CSV
            with open("tables/{}-{}.csv".format(benchmark_name, build_name), "a") as csv_file:
                writer = csv.writer(csv_file)
                writer.writerow([target_name,
                                 total_instructions,
                                 code_generation,
                                 jit_execution,
                                 helpers_execution])
        # Remove temporary file
        os.unlink("/tmp/" + benchmark_name)
The ten resulting tables are shown in the next section. In the Clang tables, numbers in green indicate a decrease in the number of instructions compared to GCC, and numbers in red indicate an increase.
Performance Tables
matmult_double (GCC)
Target | Total Instructions | Code Generation | JIT Execution | Helpers |
---|---|---|---|---|
aarch64 | 1 411 957 967 | 76 120 077 | 611 067 999 | 724 769 891 |
alpha | 3 020 752 954 | 56 143 853 | 437 106 648 | 2 527 502 453 |
arm | 8 721 987 230 | 285 174 602 | 6 633 909 955 | 1 802 902 673 |
hppa | 3 345 168 851 | 179 176 839 | 348 961 284 | 2 817 030 728 |
m68k | 3 327 223 458 | 65 872 033 | 547 273 911 | 2 714 077 514 |
mips | 2 262 859 230 | 79 312 482 | 362 219 758 | 1 821 326 990 |
mipsel | 3 176 135 194 | 79 192 739 | 402 266 174 | 2 694 676 281 |
mips64 | 2 276 881 008 | 87 067 585 | 364 561 337 | 1 825 252 086 |
mips64el | 3 189 604 541 | 86 891 148 | 404 111 884 | 2 698 601 509 |
ppc | 3 125 097 209 | 82 035 165 | 338 132 356 | 2 704 929 688 |
ppc64 | 3 202 929 198 | 88 654 395 | 379 358 099 | 2 734 916 704 |
ppc64le | 3 202 302 439 | 88 841 776 | 378 540 677 | 2 734 919 986 |
riscv64 | 1 222 310 471 | 60 743 087 | 305 444 092 | 856 123 292 |
s390x | 2 726 250 005 | 57 481 812 | 318 742 202 | 2 350 025 991 |
sh4 | 3 341 872 364 | 67 475 044 | 602 524 473 | 2 671 872 847 |
sparc64 | 3 205 825 118 | 134 723 352 | 501 019 705 | 2 570 082 061 |
x86_64 | 1 249 941 832 | 67 613 673 | 364 007 574 | 818 320 585 |
matmult_double (Clang)
Target | Total Instructions | Code Generation | JIT Execution | Helpers |
---|---|---|---|---|
aarch64 | 2 011 166 477 | 71 984 566 | 611 067 999 | 1 328 113 912 |
alpha | 4 673 820 741 | 55 207 732 | 437 106 648 | 4 181 506 361 |
arm | 8 746 454 169 | 276 672 219 | 6 633 909 955 | 1 835 871 995 |
hppa | 4 770 815 514 | 175 521 902 | 348 961 284 | 4 246 332 328 |
m68k | 3 542 927 301 | 65 164 233 | 547 273 911 | 2 930 489 157 |
mips | 3 738 305 471 | 72 287 768 | 362 219 758 | 3 303 797 945 |
mipsel | 5 239 048 897 | 72 499 086 | 402 266 174 | 4 764 283 637 |
mips64 | 3 750 825 939 | 77 896 420 | 364 561 337 | 3 308 368 182 |
mips64el | 5 251 030 988 | 78 065 089 | 404 111 884 | 4 768 854 015 |
ppc | 4 109 476 668 | 81 662 092 | 338 132 356 | 3 689 682 220 |
ppc64 | 4 170 489 431 | 87 779 750 | 379 358 099 | 3 703 351 582 |
ppc64le | 4 169 627 378 | 87 731 465 | 378 540 677 | 3 703 355 236 |
riscv64 | 1 799 067 648 | 59 198 155 | 305 444 092 | 1 434 425 401 |
s390x | 3 924 840 571 | 58 080 261 | 318 742 202 | 3 548 018 108 |
sh4 | 5 069 649 275 | 65 326 027 | 602 524 473 | 4 401 798 775 |
sparc64 | 4 918 273 993 | 131 200 185 | 501 019 705 | 4 286 054 103 |
x86_64 | 2 282 484 944 | 66 749 639 | 364 007 574 | 1 851 727 731 |
matmult_int32 (GCC)
Target | Total Instructions | Code Generation | JIT Execution | Helpers |
---|---|---|---|---|
aarch64 | 595 494 162 | 69 380 056 | 505 728 486 | 20 385 620 |
alpha | 370 047 513 | 50 451 964 | 307 209 296 | 12 386 253 |
arm | 735 549 496 | 276 195 690 | 410 431 931 | 48 921 875 |
hppa | 666 639 906 | 171 974 055 | 454 423 118 | 40 242 733 |
m68k | 406 939 070 | 59 081 632 | 329 037 444 | 18 819 994 |
mips | 497 096 141 | 71 679 287 | 416 946 656 | 8 470 198 |
mipsel | 497 010 030 | 71 506 947 | 417 032 765 | 8 470 318 |
mips64 | 478 992 426 | 78 298 258 | 388 302 800 | 12 391 368 |
mips64el | 462 357 562 | 78 074 494 | 371 890 705 | 12 392 363 |
ppc | 338 417 174 | 74 914 476 | 255 198 622 | 8 304 076 |
ppc64 | 390 129 095 | 80 972 586 | 297 023 711 | 12 132 798 |
ppc64le | 390 053 461 | 80 990 281 | 296 928 184 | 12 134 996 |
riscv64 | 349 030 315 | 54 578 504 | 281 826 149 | 12 625 662 |
s390x | 491 822 152 | 51 853 248 | 375 436 514 | 64 532 390 |
sh4 | 399 132 791 | 59 096 149 | 312 448 882 | 27 587 760 |
sparc64 | 488 710 835 | 123 407 698 | 353 794 723 | 11 508 414 |
x86_64 | 399 168 070 | 61 136 986 | 322 978 674 | 15 052 410 |
matmult_int32 (Clang)
Target | Total Instructions | Code Generation | JIT Execution | Helpers |
---|---|---|---|---|
aarch64 | 593 751 025 | 65 694 887 | 505 728 486 | 22 327 652 |
alpha | 371 067 721 | 50 014 259 | 307 209 296 | 13 844 166 |
arm | 727 777 546 | 267 935 229 | 410 431 931 | 49 410 386 |
hppa | 664 473 385 | 168 673 213 | 454 423 118 | 41 377 054 |
m68k | 408 524 487 | 58 641 545 | 329 037 444 | 20 845 498 |
mips | 491 762 457 | 65 531 396 | 416 946 656 | 9 284 405 |
mipsel | 491 996 655 | 65 679 356 | 417 032 765 | 9 284 534 |
mips64 | 472 458 134 | 70 305 442 | 388 302 800 | 13 849 892 |
mips64el | 456 176 576 | 70 434 870 | 371 890 705 | 13 851 001 |
ppc | 339 482 792 | 75 166 396 | 255 198 622 | 9 117 774 |
ppc64 | 391 412 561 | 80 800 275 | 297 023 711 | 13 588 575 |
ppc64le | 391 115 113 | 80 595 928 | 296 928 184 | 13 591 001 |
riscv64 | 349 721 940 | 53 570 099 | 281 826 149 | 14 325 692 |
s390x | 481 356 004 | 52 685 433 | 375 436 514 | 53 234 057 |
sh4 | 399 736 614 | 57 671 842 | 312 448 882 | 29 615 890 |
sparc64 | 486 470 314 | 121 641 278 | 353 794 723 | 11 034 313 |
x86_64 | 399 194 875 | 60 675 932 | 322 978 674 | 15 540 269 |
qsort_double (GCC)
Target | Total Instructions | Code Generation | JIT Execution | Helpers |
---|---|---|---|---|
aarch64 | 2 658 154 250 | 79 040 798 | 1 265 579 424 | 1 313 534 028 |
alpha | 1 949 114 474 | 57 399 955 | 869 643 481 | 1 022 071 038 |
arm | 9 118 694 070 | 850 895 346 | 4 265 464 995 | 4 002 333 729 |
hppa | 3 138 372 515 | 496 702 308 | 1 023 062 954 | 1 618 607 253 |
m68k | 4 385 213 371 | 63 926 269 | 1 224 197 193 | 3 097 089 909 |
mips | 2 098 316 452 | 82 146 339 | 942 372 869 | 1 073 797 244 |
mipsel | 2 098 510 236 | 81 980 330 | 942 732 894 | 1 073 797 012 |
mips64 | 1 970 640 902 | 90 221 912 | 787 224 479 | 1 093 194 511 |
mips64el | 1 968 095 838 | 90 070 316 | 784 830 991 | 1 093 194 531 |
ppc | 2 735 890 533 | 115 892 326 | 1 087 846 009 | 1 532 152 198 |
ppc64 | 2 684 919 199 | 122 371 298 | 1 028 369 561 | 1 534 178 340 |
ppc64le | 2 641 863 052 | 122 545 885 | 985 137 467 | 1 534 179 700 |
riscv64 | 1 589 964 563 | 62 644 918 | 643 559 932 | 883 759 713 |
s390x | 2 474 989 116 | 58 933 857 | 660 401 279 | 1 755 653 980 |
sh4 | 2 562 375 399 | 68 574 230 | 1 138 226 176 | 1 355 574 993 |
sparc64 | 3 917 963 038 | 2 055 155 359 | 1 298 625 002 | 564 182 677 |
x86_64 | 1 986 765 860 | 68 884 527 | 804 594 605 | 1 113 286 728 |
qsort_double (Clang)
Target | Total Instructions | Code Generation | JIT Execution | Helpers |
---|---|---|---|---|
aarch64 | 2 744 907 796 | 74 594 511 | 1 265 579 424 | 1 404 733 861 |
alpha | 2 070 204 037 | 56 286 633 | 869 643 481 | 1 144 273 923 |
arm | 9 163 293 622 | 822 921 830 | 4 265 464 995 | 4 074 906 797 |
hppa | 3 094 186 179 | 484 467 463 | 1 023 062 954 | 1 586 655 762 |
m68k | 4 114 339 886 | 63 107 725 | 1 224 197 193 | 2 827 034 968 |
mips | 2 240 757 989 | 74 728 757 | 942 372 869 | 1 223 656 363 |
mipsel | 2 241 264 168 | 74 875 161 | 942 732 894 | 1 223 656 113 |
mips64 | 2 125 667 762 | 80 578 997 | 787 224 479 | 1 257 864 286 |
mips64el | 2 123 478 152 | 80 782 855 | 784 830 991 | 1 257 864 306 |
ppc | 3 265 774 088 | 120 968 560 | 1 087 846 009 | 2 056 959 519 |
ppc64 | 3 198 309 630 | 127 098 740 | 1 028 369 561 | 2 042 841 329 |
ppc64le | 3 155 027 150 | 127 046 877 | 985 137 467 | 2 042 842 806 |
riscv64 | 1 667 477 270 | 60 857 834 | 643 559 932 | 963 059 504 |
s390x | 2 404 573 110 | 59 331 077 | 660 401 279 | 1 684 840 754 |
sh4 | 2 609 811 199 | 66 186 107 | 1 138 226 176 | 1 405 398 916 |
sparc64 | 4 189 185 205 | 2 351 778 131 | 1 298 625 002 | 538 782 072 |
x86_64 | 1 988 312 647 | 67 801 893 | 804 594 605 | 1 115 916 149 |
qsort_int32 (GCC)
Target | Total Instructions | Code Generation | JIT Execution | Helpers |
---|---|---|---|---|
aarch64 | 2 131 720 142 | 72 399 376 | 1 055 781 197 | 1 003 539 569 |
alpha | 1 460 366 380 | 52 934 795 | 796 943 218 | 610 488 367 |
arm | 3 372 434 621 | 843 360 743 | 1 078 158 662 | 1 450 915 216 |
hppa | 2 196 495 498 | 491 615 054 | 868 099 497 | 836 780 947 |
m68k | 1 779 675 824 | 58 332 907 | 966 478 982 | 754 863 935 |
mips | 1 499 858 843 | 74 197 186 | 842 808 363 | 582 853 294 |
mipsel | 1 502 516 358 | 74 066 097 | 845 597 211 | 582 853 050 |
mips64 | 1 498 104 595 | 84 972 133 | 802 161 902 | 610 970 560 |
mips64el | 1 477 116 988 | 84 679 402 | 781 628 546 | 610 809 040 |
ppc | 1 668 038 700 | 109 657 874 | 975 751 506 | 582 629 320 |
ppc64 | 1 779 152 045 | 115 936 847 | 1 072 790 643 | 590 424 555 |
ppc64le | 1 727 703 061 | 115 826 109 | 1 021 451 041 | 590 425 911 |
riscv64 | 1 289 198 318 | 57 502 772 | 624 840 792 | 606 854 754 |
s390x | 2 114 306 901 | 53 466 647 | 692 707 638 | 1 368 132 616 |
sh4 | 1 878 429 484 | 61 060 371 | 913 781 294 | 903 587 819 |
sparc64 | 3 352 057 480 | 2 022 774 129 | 1 141 078 790 | 188 204 561 |
x86_64 | 1 751 081 171 | 63 007 727 | 765 175 073 | 922 898 371 |
qsort_int32 (Clang)
Target | Total Instructions | Code Generation | JIT Execution | Helpers |
---|---|---|---|---|
aarch64 | 2 223 343 530 | 68 447 052 | 1 055 781 197 | 1 099 115 281 |
alpha | 1 531 490 101 | 52 235 802 | 796 943 218 | 682 311 081 |
arm | 3 370 059 365 | 815 660 694 | 1 078 158 662 | 1 476 240 009 |
hppa | 2 212 542 138 | 479 758 595 | 868 099 497 | 864 684 046 |
m68k | 1 853 277 157 | 57 802 833 | 966 478 982 | 828 995 342 |
mips | 1 549 431 653 | 67 726 753 | 842 808 363 | 638 896 537 |
mipsel | 1 552 417 791 | 67 924 305 | 845 597 211 | 638 896 275 |
mips64 | 1 559 572 362 | 76 178 736 | 802 161 902 | 681 231 724 |
mips64el | 1 539 491 325 | 76 254 175 | 781 628 546 | 681 608 604 |
ppc | 1 730 823 937 | 115 323 201 | 975 751 506 | 639 749 230 |
ppc64 | 1 855 390 899 | 121 324 924 | 1 072 790 643 | 661 275 332 |
ppc64le | 1 803 732 493 | 121 004 644 | 1 021 451 041 | 661 276 808 |
riscv64 | 1 369 588 518 | 56 200 727 | 624 840 792 | 688 546 999 |
s390x | 2 018 273 542 | 54 160 605 | 692 707 638 | 1 271 405 299 |
sh4 | 1 943 201 025 | 59 392 201 | 913 781 294 | 970 027 530 |
sparc64 | 3 626 612 511 | 2 317 128 970 | 1 141 078 790 | 168 404 751 |
x86_64 | 1 759 411 677 | 62 347 319 | 765 175 073 | 931 889 285 |
qsort_string (GCC)
Target | Total Instructions | Code Generation | JIT Execution | Helpers |
---|---|---|---|---|
aarch64 | 2 530 266 115 | 71 878 428 | 1 449 639 434 | 1 008 748 253 |
alpha | 1 793 598 941 | 52 114 495 | 1 117 230 092 | 624 254 354 |
arm | 7 155 712 165 | 3 673 959 444 | 1 747 578 444 | 1 734 174 277 |
hppa | 4 596 662 435 | 2 077 290 104 | 1 188 975 849 | 1 330 396 482 |
m68k | 2 295 149 555 | 58 023 587 | 1 343 485 580 | 893 640 388 |
mips | 2 113 525 462 | 74 968 779 | 1 501 957 602 | 536 599 081 |
mipsel | 2 110 264 854 | 74 820 076 | 1 498 845 813 | 536 598 965 |
mips64 | 1 968 388 319 | 81 540 025 | 1 278 967 522 | 607 880 772 |
mips64el | 1 950 703 742 | 81 332 799 | 1 261 490 179 | 607 880 764 |
ppc | 2 428 873 715 | 269 273 748 | 1 616 960 349 | 542 639 618 |
ppc64 | 2 404 019 471 | 273 403 312 | 1 361 586 990 | 769 029 169 |
ppc64le | 2 385 236 674 | 273 482 576 | 1 342 609 438 | 769 144 660 |
riscv64 | 1 563 526 901 | 56 535 790 | 880 912 912 | 626 078 199 |
s390x | 3 934 202 536 | 52 728 806 | 868 450 778 | 3 013 022 952 |
sh4 | 2 097 991 097 | 61 043 883 | 1 155 615 736 | 881 331 478 |
sparc64 | 4 130 814 212 | 2 078 206 607 | 1 572 858 282 | 479 749 323 |
x86_64 | 2 864 486 422 | 63 565 441 | 1 160 432 349 | 1 640 488 632 |
qsort_string (Clang)
Target | Total Instructions | Code Generation | JIT Execution | Helpers |
---|---|---|---|---|
aarch64 | 2 622 482 230 | 68 022 754 | 1 449 639 434 | 1 104 820 042 |
alpha | 1 866 463 476 | 51 536 810 | 1 117 230 092 | 697 696 574 |
arm | 7 056 218 429 | 3 549 558 538 | 1 747 578 444 | 1 759 081 447 |
hppa | 4 584 536 797 | 2 023 729 867 | 1 188 975 849 | 1 371 831 081 |
m68k | 2 391 766 444 | 57 534 808 | 1 343 485 580 | 990 746 056 |
mips | 2 158 611 852 | 68 459 450 | 1 501 957 602 | 588 194 800 |
mipsel | 2 155 675 997 | 68 635 509 | 1 498 845 813 | 588 194 675 |
mips64 | 2 031 507 933 | 73 143 579 | 1 278 967 522 | 679 396 832 |
mips64el | 2 014 166 270 | 73 279 267 | 1 261 490 179 | 679 396 824 |
ppc | 2 516 388 621 | 303 589 290 | 1 616 960 349 | 595 838 982 |
ppc64 | 2 384 468 962 | 308 642 876 | 1 361 586 990 | 714 239 096 |
ppc64le | 2 365 387 612 | 308 506 861 | 1 342 609 438 | 714 271 313 |
riscv64 | 1 646 638 770 | 55 367 615 | 880 912 912 | 710 358 243 |
s390x | 3 475 927 973 | 53 526 410 | 868 450 778 | 2 553 950 785 |
sh4 | 2 161 178 295 | 59 427 892 | 1 155 615 736 | 946 134 667 |
sparc64 | 4 410 070 085 | 2 377 799 055 | 1 572 858 282 | 459 412 748 |
x86_64 | 2 874 783 326 | 62 956 885 | 1 160 432 349 | 1 651 394 092 |
Analysis of Results
Comparison Script
To facilitate the analysis, another Python script can be used to compare Clang performance to that of GCC for each benchmark. The result is five tables (one for each benchmark) with the percentage of increase or decrease in Clang instructions compared to GCC. The script assumes that all the tables from the previous section are in a tables directory.
import os
import csv


def calculate_change(gcc_instructions, clang_instructions):
    # Calculate the percentage of change in Clang instructions compared to GCC
    percentage = round(((clang_instructions - gcc_instructions) /
                        gcc_instructions) * 100, 3)
    return "+" + str(percentage) + "%" if percentage > 0 else str(percentage) + "%"


tables = sorted(os.listdir("tables"))
csv_headers = ["Target", "Total Instructions %",
               "Code Generation %", "JIT Execution %", "Helpers %"]

for i in range(0, len(tables), 2):
    benchmark_name = tables[i].split("-")[0]
    # Extract data from tables
    clang_data, gcc_data = [], []
    with open("tables/" + tables[i], "r") as file:
        clang_data = file.readlines()
    with open("tables/" + tables[i + 1], "r") as file:
        gcc_data = file.readlines()
    with open(benchmark_name + "-compare.csv", "w") as file:
        writer = csv.writer(file)
        writer.writerow(csv_headers)
    for l in range(1, len(gcc_data)):
        gcc_split = gcc_data[l].split('"')
        clang_split = clang_data[l].split('"')
        target_name = gcc_split[0][:-1]
        gcc_instructions = int(gcc_split[1].replace(",", ""))
        clang_instructions = int(clang_split[1].replace(",", ""))
        instructions_change = calculate_change(
            gcc_instructions, clang_instructions)
        gcc_code_generation = int(gcc_split[3].replace(",", ""))
        clang_code_generation = int(clang_split[3].replace(",", ""))
        code_generation_change = calculate_change(
            gcc_code_generation, clang_code_generation)
        gcc_jit = int(gcc_split[5].replace(",", ""))
        clang_jit = int(clang_split[5].replace(",", ""))
        jit_change = calculate_change(gcc_jit, clang_jit)
        gcc_helpers = int(gcc_split[7].replace(",", ""))
        clang_helpers = int(clang_split[7].replace(",", ""))
        helpers_change = calculate_change(gcc_helpers, clang_helpers)
        with open(benchmark_name + "-compare.csv", "a") as file:
            writer = csv.writer(file)
            writer.writerow([target_name,
                             instructions_change,
                             code_generation_change,
                             jit_change,
                             helpers_change])
matmult_double
Target | Total Instructions % | Code Generation % | JIT Execution % | Helpers % |
---|---|---|---|---|
aarch64 | +42.438% | -5.433% | 0.0% | +83.246% |
alpha | +54.724% | -1.667% | 0.0% | +65.44% |
arm | +0.281% | -2.981% | 0.0% | +1.829% |
hppa | +42.618% | -2.04% | 0.0% | +50.738% |
m68k | +6.483% | -1.075% | 0.0% | +7.974% |
mips | +65.203% | -8.857% | 0.0% | +81.395% |
mipsel | +64.95% | -8.452% | 0.0% | +76.804% |
mips64 | +64.735% | -10.533% | 0.0% | +81.255% |
mips64el | +64.63% | -10.158% | 0.0% | +76.716% |
ppc | +31.499% | -0.455% | 0.0% | +36.406% |
ppc64 | +30.209% | -0.987% | 0.0% | +35.41% |
ppc64le | +30.207% | -1.25% | 0.0% | +35.41% |
riscv64 | +47.186% | -2.543% | 0.0% | +67.549% |
s390x | +43.965% | +1.041% | 0.0% | +50.978% |
sh4 | +51.701% | -3.185% | 0.0% | +64.746% |
sparc64 | +53.417% | -2.615% | 0.0% | +66.767% |
x86_64 | +82.607% | -1.278% | 0.0% | +126.284% |
matmult_int32
Target | Total Instructions % | Code Generation % | JIT Execution % | Helpers % |
---|---|---|---|---|
aarch64 | -0.293% | -5.312% | 0.0% | +9.526% |
alpha | +0.276% | -0.868% | 0.0% | +11.77% |
arm | -1.057% | -2.991% | 0.0% | +0.999% |
hppa | -0.325% | -1.919% | 0.0% | +2.819% |
m68k | +0.39% | -0.745% | 0.0% | +10.763% |
mips | -1.073% | -8.577% | 0.0% | +9.613% |
mipsel | -1.009% | -8.15% | 0.0% | +9.613% |
mips64 | -1.364% | -10.208% | 0.0% | +11.77% |
mips64el | -1.337% | -9.785% | 0.0% | +11.77% |
ppc | +0.315% | +0.336% | 0.0% | +9.799% |
ppc64 | +0.329% | -0.213% | 0.0% | +11.999% |
ppc64le | +0.272% | -0.487% | 0.0% | +11.998% |
riscv64 | +0.198% | -1.848% | 0.0% | +13.465% |
s390x | -2.128% | +1.605% | 0.0% | -17.508% |
sh4 | +0.151% | -2.41% | 0.0% | +7.352% |
sparc64 | -0.458% | -1.431% | 0.0% | -4.12% |
x86_64 | +0.007% | -0.754% | 0.0% | +3.241% |
qsort_double
Target | Total Instructions % | Code Generation % | JIT Execution % | Helpers % |
---|---|---|---|---|
aarch64 | +3.264% | -5.625% | 0.0% | +6.943% |
alpha | +6.213% | -1.94% | 0.0% | +11.956% |
arm | +0.489% | -3.288% | 0.0% | +1.813% |
hppa | -1.408% | -2.463% | 0.0% | -1.974% |
m68k | -6.177% | -1.28% | 0.0% | -8.72% |
mips | +6.788% | -9.03% | 0.0% | +13.956% |
mipsel | +6.803% | -8.667% | 0.0% | +13.956% |
mips64 | +7.867% | -10.688% | 0.0% | +15.063% |
mips64el | +7.895% | -10.311% | 0.0% | +15.063% |
ppc | +19.368% | +4.38% | 0.0% | +34.253% |
ppc64 | +19.121% | +3.863% | 0.0% | +33.155% |
ppc64le | +19.424% | +3.673% | 0.0% | +33.155% |
riscv64 | +4.875% | -2.853% | 0.0% | +8.973% |
s390x | -2.845% | +0.674% | 0.0% | -4.033% |
sh4 | +1.851% | -3.483% | 0.0% | +3.675% |
sparc64 | +6.923% | +14.433% | 0.0% | -4.502% |
x86_64 | +0.078% | -1.572% | 0.0% | +0.236% |
qsort_int32
Target | Total Instructions % | Code Generation % | JIT Execution % | Helpers % |
---|---|---|---|---|
aarch64 | +4.298% | -5.459% | 0.0% | +9.524% |
alpha | +4.87% | -1.32% | 0.0% | +11.765% |
arm | -0.07% | -3.284% | 0.0% | +1.745% |
hppa | +0.731% | -2.412% | 0.0% | +3.335% |
m68k | +4.136% | -0.909% | 0.0% | +9.82% |
mips | +3.305% | -8.721% | 0.0% | +9.615% |
mipsel | +3.321% | -8.292% | 0.0% | +9.615% |
mips64 | +4.103% | -10.349% | 0.0% | +11.5% |
mips64el | +4.223% | -9.95% | 0.0% | +11.591% |
ppc | +3.764% | +5.166% | 0.0% | +9.804% |
ppc64 | +4.285% | +4.647% | 0.0% | +12.0% |
ppc64le | +4.401% | +4.471% | 0.0% | +12.0% |
riscv64 | +6.236% | -2.264% | 0.0% | +13.462% |
s390x | -4.542% | +1.298% | 0.0% | -7.07% |
sh4 | +3.448% | -2.732% | 0.0% | +7.353% |
sparc64 | +8.191% | +14.552% | 0.0% | -10.52% |
x86_64 | +0.476% | -1.048% | 0.0% | +0.974% |
qsort_string
Target | Total Instructions % | Code Generation % | JIT Execution % | Helpers % |
---|---|---|---|---|
aarch64 | +3.645% | -5.364% | 0.0% | +9.524% |
alpha | +4.062% | -1.108% | 0.0% | +11.765% |
arm | -1.39% | -3.386% | 0.0% | +1.436% |
hppa | -0.264% | -2.578% | 0.0% | +3.114% |
m68k | +4.21% | -0.842% | 0.0% | +10.866% |
mips | +2.133% | -8.683% | 0.0% | +9.615% |
mipsel | +2.152% | -8.266% | 0.0% | +9.615% |
mips64 | +3.207% | -10.297% | 0.0% | +11.765% |
mips64el | +3.253% | -9.902% | 0.0% | +11.765% |
ppc | +3.603% | +12.744% | 0.0% | +9.804% |
ppc64 | -0.813% | +12.889% | 0.0% | -7.125% |
ppc64le | -0.832% | +12.807% | 0.0% | -7.134% |
riscv64 | +5.316% | -2.066% | 0.0% | +13.462% |
s390x | -11.648% | +1.513% | 0.0% | -15.236% |
sh4 | +3.012% | -2.647% | 0.0% | +7.353% |
sparc64 | +6.76% | +14.416% | 0.0% | -4.239% |
x86_64 | +0.359% | -0.957% | 0.0% | +0.665% |
Floating Point Benchmarks
For all five benchmarks, most targets had a decrease in the number of instructions spent in code generation; however, there was a major increase in the number of instructions spent in the execution of helpers.
To find out the reason behind this increase, the list_helpers.py script can be used with the matmult_double benchmark (which had the biggest drop in Clang performance) and any of the seventeen targets to list the executed helpers.
List helpers of ppc for matmult_double on GCC:
./list_helpers.py -- <qemu-gcc-build>/ppc-linux-user/qemu-ppc matmult_double-ppc
Results:
Executed QEMU Helpers:
No. Instructions Percentage Calls Ins/Call Helper Name Source File
---- --------------- ---------- --------------- ---------- ------------------------- ------------------------------
1 2,088,642,242 66.832% 8,000,000 261 helper_fmadd <qemu>/target/ppc/fpu_helper.c
2 420,240,000 13.447% 8,240,000 51 helper_compute_fprf_float64 <qemu>/target/ppc/fpu_helper.c
3 139,760,120 4.472% 8,240,008 16 helper_float_check_status <qemu>/target/ppc/fpu_helper.c
4 16,480,024 0.527% 8,240,012 2 helper_reset_fpstatus <qemu>/include/fpu/softfloat-helpers.h
5 11,167,515 0.357% 80,000 139 helper_fmul <qemu>/target/ppc/fpu_helper.c
6 10,320,000 0.330% 80,000 129 helper_fsub <qemu>/target/ppc/fpu_helper.c
7 10,000,000 0.320% 80,000 125 helper_fdiv <qemu>/target/ppc/fpu_helper.c
8 8,314,772 0.266% 162,603 51 helper_lookup_tb_ptr <qemu>/accel/tcg/tcg-runtime.c
9 2,618 0.000% 14 187 helper_dcbz <qemu>/target/ppc/mem_helper.c
10 1,494 0.000% 18 83 helper_raise_exception_err <qemu>/target/ppc/excp_helper.c
11 1,012 0.000% 8 126 helper_fcmpu <qemu>/target/ppc/fpu_helper.c
List helpers of ppc for matmult_double on Clang:
./list_helpers.py -- <qemu-clang-build>/ppc-linux-user/qemu-ppc matmult_double-ppc
Results:
Executed QEMU Helpers:
No. Instructions Percentage Calls Ins/Call Helper Name Source File
---- --------------- ---------- --------------- ---------- ------------------------- ------------------------------
1 3,040,716,864 73.990% 8,000,000 380 helper_fmadd <qemu>/target/ppc/fpu_helper.c
2 403,760,000 9.825% 8,240,000 49 helper_compute_fprf_float64 <qemu>/target/ppc/fpu_helper.c
3 164,480,144 4.002% 8,240,008 19 helper_float_check_status <qemu>/target/ppc/fpu_helper.c
4 18,800,000 0.457% 80,000 235 helper_fsub <qemu>/target/ppc/fpu_helper.c
5 18,230,012 0.444% 80,000 227 helper_fmul <qemu>/target/ppc/fpu_helper.c
6 18,080,000 0.440% 80,000 226 helper_fdiv <qemu>/target/ppc/fpu_helper.c
7 16,480,024 0.401% 8,240,012 2 helper_reset_fpstatus <qemu>/include/fpu/softfloat-helpers.h
8 9,127,473 0.222% 162,603 56 helper_lookup_tb_ptr <qemu>/include/exec/exec-all.h
9 3,774 0.000% 18 209 helper_raise_exception_err <qemu>/target/ppc/excp_helper.c
10 2,492 0.000% 14 178 helper_dcbz <qemu>/target/ppc/mem_helper.c
11 1,544 0.000% 8 193 helper_fcmpu <qemu>/target/ppc/fpu_helper.c
All floating point helpers had an increase in their number of instructions per call. The list_fn_callees.py script can be used to analyze one of these helpers.
List callees of helper_fdiv on GCC:
./list_fn_callees.py -f helper_fdiv -- <qemu-gcc-build>/ppc-linux-user/qemu-ppc matmult_double-ppc
Results:
Callees of helper_fdiv:
No. Instructions Percentage Calls Ins/Call Function Name Source File
---- --------------- ---------- --------------- ---------- ------------------------- ------------------------------
1 7,840,000 0.251% 80,000 98 float64_div <qemu>/include/qemu/bitops.h
List callees of helper_fdiv on Clang:
./list_fn_callees.py -f helper_fdiv -- <qemu-clang-build>/ppc-linux-user/qemu-ppc matmult_double-ppc
Results:
Callees of helper_fdiv:
No. Instructions Percentage Calls Ins/Call Function Name Source File
---- --------------- ---------- --------------- ---------- ------------------------- ------------------------------
1 15,200,000 0.370% 80,000 190 float64_div <qemu>/fpu/softfloat.c
List callees of float64_div on GCC:
./list_fn_callees.py -f float64_div -- <qemu-gcc-build>/ppc-linux-user/qemu-ppc matmult_double-ppc
Results:
Couldn't locate function: float64_div.
List callees of float64_div on Clang:
./list_fn_callees.py -f float64_div -- <qemu-clang-build>/ppc-linux-user/qemu-ppc matmult_double-ppc
Callees of float64_div:
No. Instructions Percentage Calls Ins/Call Function Name Source File
---- --------------- ---------- --------------- ---------- ------------------------- ------------------------------
1 5,760,000 0.140% 80,000 72 round_canonical <qemu>/fpu/softfloat.c
The source code of the float64_div function, which is the callee of helper_fdiv, is attached in the appendix of the report. Notice how the QEMU_FLATTEN attribute is used on the function: GCC was able to inline all of the functions beneath it to the maximum depth, while Clang failed to do so for float64_round_pack_canonical, causing the round_canonical function to be called explicitly and costing extra instructions.
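As a standalone illustration of what the attribute requests (hypothetical example code, not QEMU source), __attribute__((flatten)), which QEMU_FLATTEN expands to, asks the compiler to inline, where possible, every call made inside the annotated function, including calls inside the inlined bodies:
/* Hypothetical example, not QEMU code: flatten asks the compiler to
 * inline (where possible) the whole call tree under the function. */
#include <stdio.h>

static double square(double x)
{
    return x * x;
}

static double sum_of_squares(double x, double y)
{
    /* Under flatten, these calls to square() should also be inlined. */
    return square(x) + square(y);
}

static double __attribute__((flatten)) distance_squared(double x, double y)
{
    /* With full flattening no call remains in the generated code; a
     * compiler that gives up on part of the tree leaves explicit calls
     * behind, which is the extra per-call cost observed for Clang above. */
    return sum_of_squares(x, y);
}

int main(void)
{
    printf("%f\n", distance_squared(3.0, 4.0));
    return 0;
}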
This is the reason behind the increase in the number of instructions per call for all targets and for most of the helpers, and it shows that GCC does a better job of optimizing functions marked with __attribute__((flatten)). Since this is a compiler optimization issue, it might be fixed in future versions of Clang.
cpu_loop_exit:
By following the same procedure to analyze helper_raise_exception_err, it can be seen that cpu_loop_exit uses a different function to perform the long jump in the Clang build. This function (longjmp@GLIBC_2.2.5) executes nearly three times the number of instructions per call of the function used in the GCC build (__longjmp_chk).
This cpu_loop_exit behavior also appears in most of the other targets.
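For context, cpu_loop_exit unwinds back to QEMU's main execution loop with a long jump; the simplified sketch below (hypothetical code, not the QEMU implementation) shows the pattern. Which libc symbol that jump binds to, the plain longjmp/siglongjmp or the fortified __longjmp_chk, depends on build options such as whether _FORTIFY_SOURCE is enabled by default, which is one plausible explanation for the difference between the two builds:
/* Hypothetical sketch of the setjmp/longjmp pattern behind
 * cpu_exec()/cpu_loop_exit(); this is not the QEMU implementation. */
#include <setjmp.h>
#include <stdio.h>

static sigjmp_buf jmp_env;      /* QEMU keeps a similar buffer per vCPU */

static void fake_cpu_loop_exit(void)
{
    /* Unwind back to the main loop. Depending on build flags (e.g.
     * _FORTIFY_SOURCE), this call may bind to longjmp/siglongjmp or
     * to the fortified __longjmp_chk. */
    siglongjmp(jmp_env, 1);
}

int main(void)
{
    if (sigsetjmp(jmp_env, 0) == 0) {
        /* "Run" guest code until an exception forces a loop exit. */
        fake_cpu_loop_exit();
    } else {
        puts("back in the main loop");
    }
    return 0;
}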
Integer and String Benchmarks
These benchmarks (matmult_int32, qsort_int32, and qsort_string) use very few helpers compared to the benchmarks involving floating point numbers, so the change in performance is mainly attributed to code generation. Degradation in helpers still occurs, and one of its causes is the different long jump function used in the two builds, as discussed above.
QEMU Binary Size
To conclude the report, a small Python script is used to measure the size of the QEMU executable for each target under both the GCC and Clang builds. The last column in the output table is the percentage of change in the Clang binary size compared to GCC.
import os
import csv


def convert_bytes(n):
    for x in ['bytes', 'KB', 'MB', 'GB', 'TB']:
        if n < 1024.0:
            return "%3.1f %s" % (n, x)
        n /= 1024.0


builds = {
    "gcc": "<qemu-gcc-build>",
    "clang": "<qemu-clang-build>"
}

targets = ["aarch64", "alpha", "arm", "hppa", "m68k", "mips", "mipsel",
           "mips64", "mips64el", "ppc", "ppc64", "ppc64le", "riscv64",
           "s390x", "sh4", "sparc64", "x86_64"]

csv_headers = ["Target", "GCC", "Clang", "Difference %"]

with open("compare_exe.csv", "w") as file:
    writer = csv.writer(file)
    writer.writerow(csv_headers)

for target in targets:
    size = []
    for build_name, build_path in builds.items():
        size.append(os.path.getsize(
            "{}/{}-linux-user/qemu-{}".format(build_path, target, target)))
    with open("compare_exe.csv", "a") as file:
        writer = csv.writer(file)
        writer.writerow([target,
                         convert_bytes(size[0]),
                         convert_bytes(size[1]),
                         str(round(((size[1] - size[0]) / size[1]) * 100, 3)) + "%"])
Results
Target | GCC | Clang | Difference % |
---|---|---|---|
aarch64 | 27.6 MB | 26.4 MB | -4.722% |
alpha | 15.1 MB | 11.9 MB | -26.789% |
arm | 20.1 MB | 17.7 MB | -13.108% |
hppa | 15.4 MB | 12.4 MB | -24.662% |
m68k | 16.1 MB | 12.9 MB | -24.916% |
mips | 25.5 MB | 23.7 MB | -7.304% |
mipsel | 25.3 MB | 23.7 MB | -6.577% |
mips64 | 25.8 MB | 24.2 MB | -6.743% |
mips64el | 25.7 MB | 24.2 MB | -6.067% |
ppc | 20.1 MB | 19.2 MB | -4.539% |
ppc64 | 20.4 MB | 19.7 MB | -3.337% |
ppc64le | 20.2 MB | 19.7 MB | -2.52% |
riscv64 | 20.1 MB | 18.1 MB | -10.943% |
s390x | 17.2 MB | 13.7 MB | -25.374% |
sh4 | 15.0 MB | 11.8 MB | -27.197% |
sparc64 | 15.8 MB | 12.7 MB | -24.964% |
x86_64 | 17.2 MB | 13.9 MB | -23.38% |
Appendix
float64_div Implementation
float64 QEMU_FLATTEN
float64_div(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
                        f64_div_pre, f64_div_post);
}

static inline float64
float64_gen2(float64 xa, float64 xb, float_status *s,
             hard_f64_op2_fn hard, soft_f64_op2_fn soft,
             f64_check_fn pre, f64_check_fn post)
{
    union_float64 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f64_is_inf(ur))) {
        s->float_exception_flags |= float_flag_overflow;
    } else if (unlikely(fabs(ur.h) <= DBL_MIN) && post(ua, ub)) {
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_div(float64 a, float64 b, float_status *status)
{
    FloatParts pa = float64_unpack_canonical(a, status);
    FloatParts pb = float64_unpack_canonical(b, status);
    FloatParts pr = div_floats(pa, pb, status);

    return float64_round_pack_canonical(pr, status);
}

static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
{
    return float64_pack_raw(round_canonical(p, s, &float64_params));
}