Performance Comparison of Two QEMU Builds (Compilers and QEMU Performance)

GCC VS Clang

Ahmed Karaman - July 27, 2020

Intro

This report presents a performance comparison between two different QEMU builds, GCC and Clang. To provide a variety of test workloads, five new benchmarks are also introduced in the report. For each benchmark, the performance of QEMU is compared across the two builds for seventeen different targets.

The compiler versions used in the report are the default available versions in the Ubuntu 18.04 repositories, which are 7.5.0 for GCC and 6.0.0 for Clang.

Table of Contents

Benchmarks Overview

This section gives a quick overview of the five new benchmarks used in the report. All benchmarks are available on the project GitHub page.

matmult_double:

Standard matrix multiplication of an n*n matrix of randomly generated double numbers from 0 to 100. The value of n is passed as an argument with the -n flag. The default value is 200.

matmult_int32:

Standard matrix multiplication of an n*n matrix of randomly generated integer numbers from 0 to 100. The value of n is passed as an argument with the -n flag. The default value is 200.

qsort_double:

Quick sort of an array of n randomly generated double numbers from 0 to 1000. The value of n is passed as an argument with the -n flag. The default value is 300000.

qsort_int32:

Quick sort of an array of n randomly generated integer numbers from 0 to 50000000. The value of n is passed as an argument with the -n flag. The default value is 300000.

qsort_string:

Quick sort of an array of 10000 randomly generated strings of size 8 (including null terminating character). The sort process is repeated n number of times. The value of n is passed as an argument with the -n flag. The default value is 20.

Setup and Prerequisites

All previous reports assumed a GCC build on top of QEMU 5.0.0. Given that QEMU 5.1.0-rc1 is now released, the report will be based on this newer QEMU version.

To download and make the GCC and Clang builds for QEMU, you can run the bash snippet below:

wget https://download.qemu.org/qemu-5.1.0-rc1.tar.xz
tar xfv qemu-5.1.0-rc1.tar.xz
cd qemu-5.1.0-rc1
mkdir build-gcc
cd build-gcc
../configure
make
cd ..
mkdir build-clang
# Enter the build directory before configuring (otherwise the Clang
# build would be configured from the source root, not out-of-tree)
cd build-clang
../configure --cc=clang --cxx=clang++
make

The report will assume that the GCC build is at <qemu-gcc-build> and Clang build is at <qemu-clang-build>.

To measure the performance of the seventeen targets for all of the five benchmarks and the two QEMU builds, the Python script below is used. The script runs dissect.py (which was modified to print the number of instructions instead of percentages) on each of the five benchmarks for the two different builds. The results are ten tables, two for each benchmark.

The script assumes that all five benchmarks are in a benchmarks directory.

import csv
import os
import subprocess


############### Script Options ###############
builds = {
    "gcc": "<qemu-gcc-build>",
    "clang": "<qemu-clang-build>"
}
targets = {
    "aarch64":  "aarch64-linux-gnu-gcc",
    "alpha":    "alpha-linux-gnu-gcc",
    "arm":      "arm-linux-gnueabi-gcc",
    "hppa":     "hppa-linux-gnu-gcc",
    "m68k":     "m68k-linux-gnu-gcc",
    "mips":     "mips-linux-gnu-gcc",
    "mipsel":   "mipsel-linux-gnu-gcc",
    "mips64":   "mips64-linux-gnuabi64-gcc",
    "mips64el": "mips64el-linux-gnuabi64-gcc",
    "ppc":      "powerpc-linux-gnu-gcc",
    "ppc64":    "powerpc64-linux-gnu-gcc",
    "ppc64le":  "powerpc64le-linux-gnu-gcc",
    "riscv64":  "riscv64-linux-gnu-gcc",
    "s390x":    "s390x-linux-gnu-gcc",
    "sh4":      "sh4-linux-gnu-gcc",
    "sparc64":  "sparc64-linux-gnu-gcc",
    "x86_64":   "gcc"
}
##############################################

benchmarks = sorted(os.listdir("benchmarks"))

csv_headers = ["Target", "Total Instructions",
               "Code Generation", "JIT Execution", "Helpers"]


for benchmark in benchmarks:
    # Strip the file extension to obtain the benchmark name
    benchmark_name = os.path.splitext(benchmark)[0]

    # Start each per-build table with the header row only
    for build_name in builds:
        with open("tables/{}-{}.csv".format(benchmark_name, build_name), "w") as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(csv_headers)

    for target_name, target_compiler in targets.items():
        # Cross-compile the benchmark statically for this target.
        # check=True aborts instead of silently dissecting a stale or
        # missing binary when compilation fails (the original ignored
        # the compiler's exit status).
        subprocess.run([target_compiler,
                        "-O2",
                        "-static",
                        "benchmarks/" + benchmark,
                        "-o",
                        "/tmp/" + benchmark_name],
                       check=True)

        for build_name, build_path in builds.items():
            # Run dissect.py on the compiled binary under this QEMU build
            dissect_target = subprocess.run(["./dissect.py",
                                             "--",
                                             "{}/{}-linux-user/qemu-{}".
                                             format(build_path,
                                                    target_name,
                                                    target_name),
                                             "/tmp/" + benchmark_name],
                                            stdout=subprocess.PIPE,
                                            check=True)
            # Parse dissect's stdout (fixed line layout:
            # line 0 = total, lines 2-4 = codegen / JIT / helpers)
            lines = dissect_target.stdout.decode("utf-8").split('\n')

            total_instructions = lines[0].split()[-1]
            code_generation = lines[2].split()[-2]
            jit_execution = lines[3].split()[-2]
            helpers_execution = lines[4].split()[-2]

            # Append the measurements row to this build's CSV table
            with open("tables/{}-{}.csv".format(benchmark_name, build_name), "a") as csv_file:
                writer = csv.writer(csv_file)
                writer.writerow([target_name,
                                 total_instructions,
                                 code_generation,
                                 jit_execution,
                                 helpers_execution])
    # Remove the temporary compiled binary
    os.unlink("/tmp/" + benchmark_name)

The ten resulting tables are shown in the next section. For Clang tables, numbers in green indicate a decrease in the number of instructions, and numbers in red indicate otherwise.

Performance Tables

matmult_double (GCC)

Target Total Instructions Code Generation JIT Execution Helpers
aarch64 1 411 957 967 76 120 077 611 067 999 724 769 891
alpha 3 020 752 954 56 143 853 437 106 648 2 527 502 453
arm 8 721 987 230 285 174 602 6 633 909 955 1 802 902 673
hppa 3 345 168 851 179 176 839 348 961 284 2 817 030 728
m68k 3 327 223 458 65 872 033 547 273 911 2 714 077 514
mips 2 262 859 230 79 312 482 362 219 758 1 821 326 990
mipsel 3 176 135 194 79 192 739 402 266 174 2 694 676 281
mips64 2 276 881 008 87 067 585 364 561 337 1 825 252 086
mips64el 3 189 604 541 86 891 148 404 111 884 2 698 601 509
ppc 3 125 097 209 82 035 165 338 132 356 2 704 929 688
ppc64 3 202 929 198 88 654 395 379 358 099 2 734 916 704
ppc64le 3 202 302 439 88 841 776 378 540 677 2 734 919 986
riscv64 1 222 310 471 60 743 087 305 444 092 856 123 292
s390x 2 726 250 005 57 481 812 318 742 202 2 350 025 991
sh4 3 341 872 364 67 475 044 602 524 473 2 671 872 847
sparc64 3 205 825 118 134 723 352 501 019 705 2 570 082 061
x86_64 1 249 941 832 67 613 673 364 007 574 818 320 585

matmult_double (Clang)

Target Total Instructions Code Generation JIT Execution Helpers
aarch64 2 011 166 477 71 984 566 611 067 999 1 328 113 912
alpha 4 673 820 741 55 207 732 437 106 648 4 181 506 361
arm 8 746 454 169 276 672 219 6 633 909 955 1 835 871 995
hppa 4 770 815 514 175 521 902 348 961 284 4 246 332 328
m68k 3 542 927 301 65 164 233 547 273 911 2 930 489 157
mips 3 738 305 471 72 287 768 362 219 758 3 303 797 945
mipsel 5 239 048 897 72 499 086 402 266 174 4 764 283 637
mips64 3 750 825 939 77 896 420 364 561 337 3 308 368 182
mips64el 5 251 030 988 78 065 089 404 111 884 4 768 854 015
ppc 4 109 476 668 81 662 092 338 132 356 3 689 682 220
ppc64 4 170 489 431 87 779 750 379 358 099 3 703 351 582
ppc64le 4 169 627 378 87 731 465 378 540 677 3 703 355 236
riscv64 1 799 067 648 59 198 155 305 444 092 1 434 425 401
s390x 3 924 840 571 58 080 261 318 742 202 3 548 018 108
sh4 5 069 649 275 65 326 027 602 524 473 4 401 798 775
sparc64 4 918 273 993 131 200 185 501 019 705 4 286 054 103
x86_64 2 282 484 944 66 749 639 364 007 574 1 851 727 731



matmult_int32 (GCC)

Target Total Instructions Code Generation JIT Execution Helpers
aarch64 595 494 162 69 380 056 505 728 486 20 385 620
alpha 370 047 513 50 451 964 307 209 296 12 386 253
arm 735 549 496 276 195 690 410 431 931 48 921 875
hppa 666 639 906 171 974 055 454 423 118 40 242 733
m68k 406 939 070 59 081 632 329 037 444 18 819 994
mips 497 096 141 71 679 287 416 946 656 8 470 198
mipsel 497 010 030 71 506 947 417 032 765 8 470 318
mips64 478 992 426 78 298 258 388 302 800 12 391 368
mips64el 462 357 562 78 074 494 371 890 705 12 392 363
ppc 338 417 174 74 914 476 255 198 622 8 304 076
ppc64 390 129 095 80 972 586 297 023 711 12 132 798
ppc64le 390 053 461 80 990 281 296 928 184 12 134 996
riscv64 349 030 315 54 578 504 281 826 149 12 625 662
s390x 491 822 152 51 853 248 375 436 514 64 532 390
sh4 399 132 791 59 096 149 312 448 882 27 587 760
sparc64 488 710 835 123 407 698 353 794 723 11 508 414
x86_64 399 168 070 61 136 986 322 978 674 15 052 410

matmult_int32 (Clang)

Target Total Instructions Code Generation JIT Execution Helpers
aarch64 593 751 025 65 694 887 505 728 486 22 327 652
alpha 371 067 721 50 014 259 307 209 296 13 844 166
arm 727 777 546 267 935 229 410 431 931 49 410 386
hppa 664 473 385 168 673 213 454 423 118 41 377 054
m68k 408 524 487 58 641 545 329 037 444 20 845 498
mips 491 762 457 65 531 396 416 946 656 9 284 405
mipsel 491 996 655 65 679 356 417 032 765 9 284 534
mips64 472 458 134 70 305 442 388 302 800 13 849 892
mips64el 456 176 576 70 434 870 371 890 705 13 851 001
ppc 339 482 792 75 166 396 255 198 622 9 117 774
ppc64 391 412 561 80 800 275 297 023 711 13 588 575
ppc64le 391 115 113 80 595 928 296 928 184 13 591 001
riscv64 349 721 940 53 570 099 281 826 149 14 325 692
s390x 481 356 004 52 685 433 375 436 514 53 234 057
sh4 399 736 614 57 671 842 312 448 882 29 615 890
sparc64 486 470 314 121 641 278 353 794 723 11 034 313
x86_64 399 194 875 60 675 932 322 978 674 15 540 269



qsort_double (GCC)

Target Total Instructions Code Generation JIT Execution Helpers
aarch64 2 658 154 250 79 040 798 1 265 579 424 1 313 534 028
alpha 1 949 114 474 57 399 955 869 643 481 1 022 071 038
arm 9 118 694 070 850 895 346 4 265 464 995 4 002 333 729
hppa 3 138 372 515 496 702 308 1 023 062 954 1 618 607 253
m68k 4 385 213 371 63 926 269 1 224 197 193 3 097 089 909
mips 2 098 316 452 82 146 339 942 372 869 1 073 797 244
mipsel 2 098 510 236 81 980 330 942 732 894 1 073 797 012
mips64 1 970 640 902 90 221 912 787 224 479 1 093 194 511
mips64el 1 968 095 838 90 070 316 784 830 991 1 093 194 531
ppc 2 735 890 533 115 892 326 1 087 846 009 1 532 152 198
ppc64 2 684 919 199 122 371 298 1 028 369 561 1 534 178 340
ppc64le 2 641 863 052 122 545 885 985 137 467 1 534 179 700
riscv64 1 589 964 563 62 644 918 643 559 932 883 759 713
s390x 2 474 989 116 58 933 857 660 401 279 1 755 653 980
sh4 2 562 375 399 68 574 230 1 138 226 176 1 355 574 993
sparc64 3 917 963 038 2 055 155 359 1 298 625 002 564 182 677
x86_64 1 986 765 860 68 884 527 804 594 605 1 113 286 728

qsort_double (Clang)

Target Total Instructions Code Generation JIT Execution Helpers
aarch64 2 744 907 796 74 594 511 1 265 579 424 1 404 733 861
alpha 2 070 204 037 56 286 633 869 643 481 1 144 273 923
arm 9 163 293 622 822 921 830 4 265 464 995 4 074 906 797
hppa 3 094 186 179 484 467 463 1 023 062 954 1 586 655 762
m68k 4 114 339 886 63 107 725 1 224 197 193 2 827 034 968
mips 2 240 757 989 74 728 757 942 372 869 1 223 656 363
mipsel 2 241 264 168 74 875 161 942 732 894 1 223 656 113
mips64 2 125 667 762 80 578 997 787 224 479 1 257 864 286
mips64el 2 123 478 152 80 782 855 784 830 991 1 257 864 306
ppc 3 265 774 088 120 968 560 1 087 846 009 2 056 959 519
ppc64 3 198 309 630 127 098 740 1 028 369 561 2 042 841 329
ppc64le 3 155 027 150 127 046 877 985 137 467 2 042 842 806
riscv64 1 667 477 270 60 857 834 643 559 932 963 059 504
s390x 2 404 573 110 59 331 077 660 401 279 1 684 840 754
sh4 2 609 811 199 66 186 107 1 138 226 176 1 405 398 916
sparc64 4 189 185 205 2 351 778 131 1 298 625 002 538 782 072
x86_64 1 988 312 647 67 801 893 804 594 605 1 115 916 149



qsort_int32 (GCC)

Target Total Instructions Code Generation JIT Execution Helpers
aarch64 2 131 720 142 72 399 376 1 055 781 197 1 003 539 569
alpha 1 460 366 380 52 934 795 796 943 218 610 488 367
arm 3 372 434 621 843 360 743 1 078 158 662 1 450 915 216
hppa 2 196 495 498 491 615 054 868 099 497 836 780 947
m68k 1 779 675 824 58 332 907 966 478 982 754 863 935
mips 1 499 858 843 74 197 186 842 808 363 582 853 294
mipsel 1 502 516 358 74 066 097 845 597 211 582 853 050
mips64 1 498 104 595 84 972 133 802 161 902 610 970 560
mips64el 1 477 116 988 84 679 402 781 628 546 610 809 040
ppc 1 668 038 700 109 657 874 975 751 506 582 629 320
ppc64 1 779 152 045 115 936 847 1 072 790 643 590 424 555
ppc64le 1 727 703 061 115 826 109 1 021 451 041 590 425 911
riscv64 1 289 198 318 57 502 772 624 840 792 606 854 754
s390x 2 114 306 901 53 466 647 692 707 638 1 368 132 616
sh4 1 878 429 484 61 060 371 913 781 294 903 587 819
sparc64 3 352 057 480 2 022 774 129 1 141 078 790 188 204 561
x86_64 1 751 081 171 63 007 727 765 175 073 922 898 371

qsort_int32 (Clang)

Target Total Instructions Code Generation JIT Execution Helpers
aarch64 2 223 343 530 68 447 052 1 055 781 197 1 099 115 281
alpha 1 531 490 101 52 235 802 796 943 218 682 311 081
arm 3 370 059 365 815 660 694 1 078 158 662 1 476 240 009
hppa 2 212 542 138 479 758 595 868 099 497 864 684 046
m68k 1 853 277 157 57 802 833 966 478 982 828 995 342
mips 1 549 431 653 67 726 753 842 808 363 638 896 537
mipsel 1 552 417 791 67 924 305 845 597 211 638 896 275
mips64 1 559 572 362 76 178 736 802 161 902 681 231 724
mips64el 1 539 491 325 76 254 175 781 628 546 681 608 604
ppc 1 730 823 937 115 323 201 975 751 506 639 749 230
ppc64 1 855 390 899 121 324 924 1 072 790 643 661 275 332
ppc64le 1 803 732 493 121 004 644 1 021 451 041 661 276 808
riscv64 1 369 588 518 56 200 727 624 840 792 688 546 999
s390x 2 018 273 542 54 160 605 692 707 638 1 271 405 299
sh4 1 943 201 025 59 392 201 913 781 294 970 027 530
sparc64 3 626 612 511 2 317 128 970 1 141 078 790 168 404 751
x86_64 1 759 411 677 62 347 319 765 175 073 931 889 285



qsort_string (GCC)

Target Total Instructions Code Generation JIT Execution Helpers
aarch64 2 530 266 115 71 878 428 1 449 639 434 1 008 748 253
alpha 1 793 598 941 52 114 495 1 117 230 092 624 254 354
arm 7 155 712 165 3 673 959 444 1 747 578 444 1 734 174 277
hppa 4 596 662 435 2 077 290 104 1 188 975 849 1 330 396 482
m68k 2 295 149 555 58 023 587 1 343 485 580 893 640 388
mips 2 113 525 462 74 968 779 1 501 957 602 536 599 081
mipsel 2 110 264 854 74 820 076 1 498 845 813 536 598 965
mips64 1 968 388 319 81 540 025 1 278 967 522 607 880 772
mips64el 1 950 703 742 81 332 799 1 261 490 179 607 880 764
ppc 2 428 873 715 269 273 748 1 616 960 349 542 639 618
ppc64 2 404 019 471 273 403 312 1 361 586 990 769 029 169
ppc64le 2 385 236 674 273 482 576 1 342 609 438 769 144 660
riscv64 1 563 526 901 56 535 790 880 912 912 626 078 199
s390x 3 934 202 536 52 728 806 868 450 778 3 013 022 952
sh4 2 097 991 097 61 043 883 1 155 615 736 881 331 478
sparc64 4 130 814 212 2 078 206 607 1 572 858 282 479 749 323
x86_64 2 864 486 422 63 565 441 1 160 432 349 1 640 488 632

qsort_string (Clang)

Target Total Instructions Code Generation JIT Execution Helpers
aarch64 2 622 482 230 68 022 754 1 449 639 434 1 104 820 042
alpha 1 866 463 476 51 536 810 1 117 230 092 697 696 574
arm 7 056 218 429 3 549 558 538 1 747 578 444 1 759 081 447
hppa 4 584 536 797 2 023 729 867 1 188 975 849 1 371 831 081
m68k 2 391 766 444 57 534 808 1 343 485 580 990 746 056
mips 2 158 611 852 68 459 450 1 501 957 602 588 194 800
mipsel 2 155 675 997 68 635 509 1 498 845 813 588 194 675
mips64 2 031 507 933 73 143 579 1 278 967 522 679 396 832
mips64el 2 014 166 270 73 279 267 1 261 490 179 679 396 824
ppc 2 516 388 621 303 589 290 1 616 960 349 595 838 982
ppc64 2 384 468 962 308 642 876 1 361 586 990 714 239 096
ppc64le 2 365 387 612 308 506 861 1 342 609 438 714 271 313
riscv64 1 646 638 770 55 367 615 880 912 912 710 358 243
s390x 3 475 927 973 53 526 410 868 450 778 2 553 950 785
sh4 2 161 178 295 59 427 892 1 155 615 736 946 134 667
sparc64 4 410 070 085 2 377 799 055 1 572 858 282 459 412 748
x86_64 2 874 783 326 62 956 885 1 160 432 349 1 651 394 092



Analysis of Results

Comparison Script

To facilitate the analysis, another Python script can be used to compare Clang performance to that of GCC for each benchmark. The result would be five tables (one for each benchmark) with the percentage of increase/decrease in Clang instructions compared to GCC.

The script assumes that all the tables from the previous section are in a tables directory.

import os
import csv


def calculate_change(gcc_instructions, clang_instructions):
    """Return the percentage change of Clang vs. GCC as a signed string.

    Positive results are prefixed with '+'; negative ones already carry
    their '-' sign. The value is rounded to three decimal places.
    """
    delta = clang_instructions - gcc_instructions
    percentage = round(delta / gcc_instructions * 100, 3)
    sign = "+" if percentage > 0 else ""
    return f"{sign}{percentage}%"


tables = sorted(os.listdir("tables"))

csv_headers = ["Target", "Total Instructions %",
               "Code Generation %", "JIT Execution %", "Helpers %"]

# Tables come in pairs per benchmark; sorting puts
# "<benchmark>-clang.csv" immediately before "<benchmark>-gcc.csv".
# (The original also did a no-op "i += 2" inside the for loop, which
# has no effect on a range-driven loop variable; it is dropped here.)
for i in range(0, len(tables), 2):
    benchmark_name = tables[i].split("-")[0]

    # Parse both tables with the csv module, which handles the quoted,
    # comma-grouped numbers robustly (the previous split('"') approach
    # broke on any value small enough to be written without commas).
    with open("tables/" + tables[i], "r") as file:
        clang_rows = list(csv.reader(file))[1:]   # skip header row
    with open("tables/" + tables[i + 1], "r") as file:
        gcc_rows = list(csv.reader(file))[1:]

    with open(benchmark_name + "-compare.csv", "w") as file:
        writer = csv.writer(file)
        writer.writerow(csv_headers)

        for gcc_row, clang_row in zip(gcc_rows, clang_rows):
            target_name = gcc_row[0]
            # Columns 1-4: total, code generation, JIT execution, helpers
            changes = [
                calculate_change(int(gcc_value.replace(",", "")),
                                 int(clang_value.replace(",", "")))
                for gcc_value, clang_value in zip(gcc_row[1:5],
                                                  clang_row[1:5])
            ]
            writer.writerow([target_name] + changes)

matmult_double

Target Total Instructions % Code Generation % JIT Execution % Helpers %
aarch64 +42.438% -5.433% 0.0% +83.246%
alpha +54.724% -1.667% 0.0% +65.44%
arm +0.281% -2.981% 0.0% +1.829%
hppa +42.618% -2.04% 0.0% +50.738%
m68k +6.483% -1.075% 0.0% +7.974%
mips +65.203% -8.857% 0.0% +81.395%
mipsel +64.95% -8.452% 0.0% +76.804%
mips64 +64.735% -10.533% 0.0% +81.255%
mips64el +64.63% -10.158% 0.0% +76.716%
ppc +31.499% -0.455% 0.0% +36.406%
ppc64 +30.209% -0.987% 0.0% +35.41%
ppc64le +30.207% -1.25% 0.0% +35.41%
riscv64 +47.186% -2.543% 0.0% +67.549%
s390x +43.965% +1.041% 0.0% +50.978%
sh4 +51.701% -3.185% 0.0% +64.746%
sparc64 +53.417% -2.615% 0.0% +66.767%
x86_64 +82.607% -1.278% 0.0% +126.284%

matmult_int32

Target Total Instructions % Code Generation % JIT Execution % Helpers %
aarch64 -0.293% -5.312% 0.0% +9.526%
alpha +0.276% -0.868% 0.0% +11.77%
arm -1.057% -2.991% 0.0% +0.999%
hppa -0.325% -1.919% 0.0% +2.819%
m68k +0.39% -0.745% 0.0% +10.763%
mips -1.073% -8.577% 0.0% +9.613%
mipsel -1.009% -8.15% 0.0% +9.613%
mips64 -1.364% -10.208% 0.0% +11.77%
mips64el -1.337% -9.785% 0.0% +11.77%
ppc +0.315% +0.336% 0.0% +9.799%
ppc64 +0.329% -0.213% 0.0% +11.999%
ppc64le +0.272% -0.487% 0.0% +11.998%
riscv64 +0.198% -1.848% 0.0% +13.465%
s390x -2.128% +1.605% 0.0% -17.508%
sh4 +0.151% -2.41% 0.0% +7.352%
sparc64 -0.458% -1.431% 0.0% -4.12%
x86_64 +0.007% -0.754% 0.0% +3.241%

qsort_double

Target Total Instructions % Code Generation % JIT Execution % Helpers %
aarch64 +3.264% -5.625% 0.0% +6.943%
alpha +6.213% -1.94% 0.0% +11.956%
arm +0.489% -3.288% 0.0% +1.813%
hppa -1.408% -2.463% 0.0% -1.974%
m68k -6.177% -1.28% 0.0% -8.72%
mips +6.788% -9.03% 0.0% +13.956%
mipsel +6.803% -8.667% 0.0% +13.956%
mips64 +7.867% -10.688% 0.0% +15.063%
mips64el +7.895% -10.311% 0.0% +15.063%
ppc +19.368% +4.38% 0.0% +34.253%
ppc64 +19.121% +3.863% 0.0% +33.155%
ppc64le +19.424% +3.673% 0.0% +33.155%
riscv64 +4.875% -2.853% 0.0% +8.973%
s390x -2.845% +0.674% 0.0% -4.033%
sh4 +1.851% -3.483% 0.0% +3.675%
sparc64 +6.923% +14.433% 0.0% -4.502%
x86_64 +0.078% -1.572% 0.0% +0.236%

qsort_int32

Target Total Instructions % Code Generation % JIT Execution % Helpers %
aarch64 +4.298% -5.459% 0.0% +9.524%
alpha +4.87% -1.32% 0.0% +11.765%
arm -0.07% -3.284% 0.0% +1.745%
hppa +0.731% -2.412% 0.0% +3.335%
m68k +4.136% -0.909% 0.0% +9.82%
mips +3.305% -8.721% 0.0% +9.615%
mipsel +3.321% -8.292% 0.0% +9.615%
mips64 +4.103% -10.349% 0.0% +11.5%
mips64el +4.223% -9.95% 0.0% +11.591%
ppc +3.764% +5.166% 0.0% +9.804%
ppc64 +4.285% +4.647% 0.0% +12.0%
ppc64le +4.401% +4.471% 0.0% +12.0%
riscv64 +6.236% -2.264% 0.0% +13.462%
s390x -4.542% +1.298% 0.0% -7.07%
sh4 +3.448% -2.732% 0.0% +7.353%
sparc64 +8.191% +14.552% 0.0% -10.52%
x86_64 +0.476% -1.048% 0.0% +0.974%

qsort_string

Target Total Instructions % Code Generation % JIT Execution % Helpers %
aarch64 +3.645% -5.364% 0.0% +9.524%
alpha +4.062% -1.108% 0.0% +11.765%
arm -1.39% -3.386% 0.0% +1.436%
hppa -0.264% -2.578% 0.0% +3.114%
m68k +4.21% -0.842% 0.0% +10.866%
mips +2.133% -8.683% 0.0% +9.615%
mipsel +2.152% -8.266% 0.0% +9.615%
mips64 +3.207% -10.297% 0.0% +11.765%
mips64el +3.253% -9.902% 0.0% +11.765%
ppc +3.603% +12.744% 0.0% +9.804%
ppc64 -0.813% +12.889% 0.0% -7.125%
ppc64le -0.832% +12.807% 0.0% -7.134%
riscv64 +5.316% -2.066% 0.0% +13.462%
s390x -11.648% +1.513% 0.0% -15.236%
sh4 +3.012% -2.647% 0.0% +7.353%
sparc64 +6.76% +14.416% 0.0% -4.239%
x86_64 +0.359% -0.957% 0.0% +0.665%

Floating Point Benchmarks

For all five benchmarks, most targets had a decrease in the number of instructions spent in code generation, however, there was a major increase in the number of instructions spent in the execution of helpers.

To find out the reason behind this increase, the list_helpers.py script can be used with the matmult_double benchmark (which had the biggest decrease in Clang performance) and any of the seventeen targets to list the executed helpers.

List helpers of ppc for matmult_double on GCC:

./list_helpers.py -- <qemu-gcc-build>/ppc-linux-user/qemu-ppc matmult_double-ppc

Results:

Executed QEMU Helpers:

 No.     Instructions  Percentage            Calls    Ins/Call  Helper Name                   Source File
----  ---------------  ----------  ---------------  ----------  -------------------------     ------------------------------
   1    2,088,642,242     66.832%        8,000,000         261  helper_fmadd                  <qemu>/target/ppc/fpu_helper.c
   2      420,240,000     13.447%        8,240,000          51  helper_compute_fprf_float64   <qemu>/target/ppc/fpu_helper.c
   3      139,760,120      4.472%        8,240,008          16  helper_float_check_status     <qemu>/target/ppc/fpu_helper.c
   4       16,480,024      0.527%        8,240,012           2  helper_reset_fpstatus         <qemu>/include/fpu/softfloat-helpers.h
   5       11,167,515      0.357%           80,000         139  helper_fmul                   <qemu>/target/ppc/fpu_helper.c
   6       10,320,000      0.330%           80,000         129  helper_fsub                   <qemu>/target/ppc/fpu_helper.c
   7       10,000,000      0.320%           80,000         125  helper_fdiv                   <qemu>/target/ppc/fpu_helper.c
   8        8,314,772      0.266%          162,603          51  helper_lookup_tb_ptr          <qemu>/accel/tcg/tcg-runtime.c
   9            2,618      0.000%               14         187  helper_dcbz                   <qemu>/target/ppc/mem_helper.c
  10            1,494      0.000%               18          83  helper_raise_exception_err    <qemu>/target/ppc/excp_helper.c
  11            1,012      0.000%                8         126  helper_fcmpu                  <qemu>/target/ppc/fpu_helper.c

List helpers of ppc for matmult_double on Clang:

./list_helpers.py -- <qemu-clang-build>/ppc-linux-user/qemu-ppc matmult_double-ppc

Results:

Executed QEMU Helpers:

 No.     Instructions  Percentage            Calls    Ins/Call  Helper Name                   Source File
----  ---------------  ----------  ---------------  ----------  -------------------------     ------------------------------
   1    3,040,716,864     73.990%        8,000,000         380  helper_fmadd                  <qemu>/target/ppc/fpu_helper.c
   2      403,760,000      9.825%        8,240,000          49  helper_compute_fprf_float64   <qemu>/target/ppc/fpu_helper.c
   3      164,480,144      4.002%        8,240,008          19  helper_float_check_status     <qemu>/target/ppc/fpu_helper.c
   4       18,800,000      0.457%           80,000         235  helper_fsub                   <qemu>/target/ppc/fpu_helper.c
   5       18,230,012      0.444%           80,000         227  helper_fmul                   <qemu>/target/ppc/fpu_helper.c
   6       18,080,000      0.440%           80,000         226  helper_fdiv                   <qemu>/target/ppc/fpu_helper.c
   7       16,480,024      0.401%        8,240,012           2  helper_reset_fpstatus         <qemu>/include/fpu/softfloat-helpers.h
   8        9,127,473      0.222%          162,603          56  helper_lookup_tb_ptr          <qemu>/include/exec/exec-all.h
   9            3,774      0.000%               18         209  helper_raise_exception_err    <qemu>/target/ppc/excp_helper.c
  10            2,492      0.000%               14         178  helper_dcbz                   <qemu>/target/ppc/mem_helper.c
  11            1,544      0.000%                8         193  helper_fcmpu                  <qemu>/target/ppc/fpu_helper.c

All floating point helpers had an increase in their number of instructions per call. The list_fn_callees.py script can be utilized to analyse one of these helpers.

List callees of helper_fdiv on GCC:

./list_fn_callees.py -f helper_fdiv -- <qemu-gcc-build>/ppc-linux-user/qemu-ppc matmult_double-ppc

Results:

Callees of helper_fdiv:

 No.     Instructions  Percentage            Calls    Ins/Call  Function Name              Source File
----  ---------------  ----------  ---------------  ----------  -------------------------  ------------------------------
   1        7,840,000      0.251%           80,000          98  float64_div                 <qemu>/include/qemu/bitops.h

List callees of helper_fdiv on Clang:

./list_fn_callees.py -f helper_fdiv -- <qemu-clang-build>/ppc-linux-user/qemu-ppc matmult_double-ppc

Results:

Callees of helper_fdiv:

 No.     Instructions  Percentage            Calls    Ins/Call  Function Name              Source File
----  ---------------  ----------  ---------------  ----------  -------------------------  ------------------------------
   1       15,200,000      0.370%           80,000         190  float64_div                 <qemu>/fpu/softfloat.c

List callees of float64_div on GCC:

./list_fn_callees.py -f float64_div -- <qemu-gcc-build>/ppc-linux-user/qemu-ppc matmult_double-ppc

Results:

Couldn't locate function: float64_div.

List callees of float64_div on Clang:

./list_fn_callees.py -f float64_div -- <qemu-clang-build>/ppc-linux-user/qemu-ppc matmult_double-ppc
Callees of float64_div:

 No.     Instructions  Percentage            Calls    Ins/Call  Function Name              Source File
----  ---------------  ----------  ---------------  ----------  -------------------------  ------------------------------
   1        5,760,000      0.140%           80,000          72  round_canonical             <qemu>/fpu/softfloat.c

The source code of the float64_div function which is the callee of helper_fdiv is attached in the appendix section of the report. Notice how the QEMU_FLATTEN attribute is used in the function, but it seems that GCC was able to inline all functions within to the maximum depth. Unlike Clang, which failed to do so with float64_round_pack_canonical causing the round_canonical function to be explicitly called thus costing extra instructions.

This is the reason behind the increase in the number of instructions per call for all targets and for most of the helpers, which proves that GCC does a better job in optimizing functions with __attribute__((flatten)). Since this is a compiler optimization problem, it might be fixed in future versions of Clang.

cpu_loop_exit:

By following the same procedure to analyze the helper_raise_exception_err, it can be seen that cpu_loop_exit uses a different function for performing long jump in Clang. This function (longjmp@GLIBC_2.2.5) executes nearly 3 times the number of instructions per call of the function which is used in the GCC build (__longjmp_chk).

This cpu_loop_exit behavior also appears in most of the other targets.

Integer and String benchmarks

These types of benchmarks (matmult_int32, qsort_int32 and qsort_string) use very few helpers compared to benchmarks involving floating point numbers. The change in performance is mainly attributed to code generation. Degradation in helpers still occurs and one of its reasons is the different long jump function used in the two builds which was discussed above.

QEMU Binary Size

To conclude the report, a small Python script is used to measure the size of the QEMU target binary for each of GCC and Clang. The last column in the output table is the percentage of change in the Clang binary size compared to GCC.

import os
import csv

def convert_bytes(n):
    """Format a byte count *n* as a human-readable string (e.g. '2.0 KB').

    Divides by 1024 until the value fits the current unit. The original
    fell off the end of the loop and returned None for values of a
    petabyte or more; those now fall back to a 'PB' result.
    """
    for unit in ['bytes', 'KB', 'MB', 'GB', 'TB']:
        if n < 1024.0:
            return "%3.1f %s" % (n, unit)
        n /= 1024.0
    # n has already been divided down to petabyte units here
    return "%3.1f %s" % (n, 'PB')


builds = {
    "gcc": "<qemu-gcc-build>",
    "clang": "<qemu-clang-build>"
}

targets = ["aarch64", "alpha", "arm", "hppa", "m68k", "mips", "mipsel",
           "mips64", "mips64el", "ppc", "ppc64", "ppc64le", "riscv64",
           "s390x", "sh4", "sparc64", "x86_64"]

csv_headers = ["Target", "GCC", "Clang", "Difference %"]

with open("compare_exe.csv", "w") as file:
    writer = csv.writer(file)
    writer.writerow(csv_headers)

    for target in targets:
        # dicts preserve insertion order, so GCC size comes first
        gcc_size, clang_size = (
            os.path.getsize("{}/{}-linux-user/qemu-{}".format(build_path,
                                                              target,
                                                              target))
            for build_path in builds.values())
        # Percentage change of the Clang binary relative to the GCC
        # baseline. The original divided by the Clang size, which does
        # not match the stated "compared to GCC" semantics.
        difference = round(((clang_size - gcc_size) / gcc_size) * 100, 3)
        writer.writerow([target,
                         convert_bytes(gcc_size),
                         convert_bytes(clang_size),
                         str(difference) + "%"])

Results

Target GCC Clang Difference %
aarch64 27.6 MB 26.4 MB -4.722%
alpha 15.1 MB 11.9 MB -26.789%
arm 20.1 MB 17.7 MB -13.108%
hppa 15.4 MB 12.4 MB -24.662%
m68k 16.1 MB 12.9 MB -24.916%
mips 25.5 MB 23.7 MB -7.304%
mipsel 25.3 MB 23.7 MB -6.577%
mips64 25.8 MB 24.2 MB -6.743%
mips64el 25.7 MB 24.2 MB -6.067%
ppc 20.1 MB 19.2 MB -4.539%
ppc64 20.4 MB 19.7 MB -3.337%
ppc64le 20.2 MB 19.7 MB -2.52%
riscv64 20.1 MB 18.1 MB -10.943%
s390x 17.2 MB 13.7 MB -25.374%
sh4 15.0 MB 11.8 MB -27.197%
sparc64 15.8 MB 12.7 MB -24.964%
x86_64 17.2 MB 13.9 MB -23.38%

Appendix

float64_div Implementation

/*
 * Double-precision division entry point (QEMU softfloat).
 * QEMU_FLATTEN (__attribute__((flatten))) asks the compiler to inline
 * the entire callee tree into this function -- the attribute whose
 * differing GCC/Clang handling is analyzed in this report.
 */
float64 QEMU_FLATTEN
float64_div(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
                        f64_div_pre, f64_div_post);
}

/*
 * Generic two-operand double-precision dispatcher: attempt the host-FPU
 * fast path ("hard"), falling back to the softfloat implementation
 * ("soft") whenever the status, operands, or result need full IEEE
 * handling.
 */
static inline float64
float64_gen2(float64 xa, float64 xb, float_status *s,
              hard_f64_op2_fn hard, soft_f64_op2_fn soft,
              f64_check_fn pre, f64_check_fn post)
{
    union_float64 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    /* Host FPU not usable for this float_status: take the soft path */
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    /* Per-operation precondition check on the operands */
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    /* Fast path: run the operation on the host hardware */
    ur.h = hard(ua.h, ub.h);
    if (unlikely(f64_is_inf(ur))) {
        s->float_exception_flags |= float_flag_overflow;
    } else if (unlikely(fabs(ur.h) <= DBL_MIN) && post(ua, ub)) {
        /* Tiny result may have underflowed: redo in software */
        goto soft;
    }
    return ur.s;

  soft:
    return soft(ua.s, ub.s, s);
}

/* Pure-software double-precision division: unpack both operands to the
 * canonical FloatParts form, divide, then round and repack. */
static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_div(float64 a, float64 b, float_status *status)
{
    FloatParts pa = float64_unpack_canonical(a, status);
    FloatParts pb = float64_unpack_canonical(b, status);
    FloatParts pr = div_floats(pa, pb, status);

    return float64_round_pack_canonical(pr, status);
}

/*
 * Round canonical FloatParts and repack them as a raw float64. Per the
 * analysis in this report, Clang 6.0 leaves round_canonical() as an
 * explicit call here despite QEMU_FLATTEN, while GCC inlines it fully.
 */
static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
{
    return float64_pack_raw(round_canonical(p, s, &float64_params));
}

LinkedIn, Twitter, Facebook