Intro
This report presents a performance comparison between two QEMU builds: one compiled with GCC and one with Clang. To provide a variety of test workloads, five new benchmarks are also introduced in the report. For each benchmark, the performance of QEMU is compared across the two builds for seventeen different targets.
The compiler versions used in the report are the default versions available in the Ubuntu 18.04 repositories: 7.5.0 for GCC and 6.0.0 for Clang.
Table of Contents
- Benchmarks Overview
- Setup and Prerequisites
- Performance Tables
- Analysis of Results
- QEMU Binary Size
- Appendix
Benchmarks Overview
This section gives a quick overview of the five new benchmarks used in the report. All benchmarks are available on the project GitHub page.
matmult_double:
Standard matrix multiplication of an n*n matrix of randomly generated double numbers from 0 to 100. The value of n is passed as an argument with the -n flag. The default value is 200.
matmult_int32:
Standard matrix multiplication of an n*n matrix of randomly generated integer numbers from 0 to 100. The value of n is passed as an argument with the -n flag. The default value is 200.
qsort_double:
Quick sort of an array of n randomly generated double numbers from 0 to 1000. The value of n is passed as an argument with the -n flag. The default value is 300000.
qsort_int32:
Quick sort of an array of n randomly generated integer numbers from 0 to 50000000. The value of n is passed as an argument with the -n flag. The default value is 300000.
qsort_string:
Quick sort of an array of 10000 randomly generated strings of size 8 (including the null terminating character). The sort process is repeated n times. The value of n is passed as an argument with the -n flag. The default value is 20.
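As a rough illustration of what these workloads do, the hypothetical C snippet below sketches the core of matmult_double with the default n of 200 hardcoded; it is not the actual benchmark source, which is available on the project GitHub page:
/* Hypothetical sketch of the matmult_double kernel (not the actual
 * benchmark source): multiply two n*n matrices of random doubles
 * in the range 0 to 100. */
#include <stdlib.h>

#define N 200   /* default value of the -n flag */

static double a[N][N], b[N][N], c[N][N];

int main(void)
{
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            a[i][j] = (double)rand() / RAND_MAX * 100.0;
            b[i][j] = (double)rand() / RAND_MAX * 100.0;
        }
    }
    /* Standard triple-loop matrix multiplication. */
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            c[i][j] = 0.0;
            for (int k = 0; k < N; k++) {
                c[i][j] += a[i][k] * b[k][j];
            }
        }
    }
    return 0;
}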
Setup and Prerequisites
All previous reports assumed a GCC build of QEMU 5.0.0. Given that QEMU 5.1.0-rc1 is now released, this report is based on the newer QEMU version.
To download QEMU and create the GCC and Clang builds, you can run the bash snippet below:
wget https://download.qemu.org/qemu-5.1.0-rc1.tar.xz
tar xfv qemu-5.1.0-rc1.tar.xz
cd qemu-5.1.0-rc1
mkdir build-gcc
cd build-gcc
../configure
make
cd ..
mkdir build-clang
cd build-clang
../configure --cc=clang --cxx=clang++
make
The report will assume that the GCC build is at <qemu-gcc-build> and the Clang build is at <qemu-clang-build>.
To measure the performance of the seventeen targets for all five benchmarks and the two QEMU builds, the Python script below is used. The script runs dissect.py (modified to print the number of instructions instead of percentages) on each of the five benchmarks for the two builds. The results are ten tables, two for each benchmark. The script assumes that all five benchmarks are in a benchmarks directory.
import csv
import os
import subprocess

############### Script Options ###############
builds = {
    "gcc": "<qemu-gcc-build>",
    "clang": "<qemu-clang-build>"
}

targets = {
    "aarch64": "aarch64-linux-gnu-gcc",
    "alpha": "alpha-linux-gnu-gcc",
    "arm": "arm-linux-gnueabi-gcc",
    "hppa": "hppa-linux-gnu-gcc",
    "m68k": "m68k-linux-gnu-gcc",
    "mips": "mips-linux-gnu-gcc",
    "mipsel": "mipsel-linux-gnu-gcc",
    "mips64": "mips64-linux-gnuabi64-gcc",
    "mips64el": "mips64el-linux-gnuabi64-gcc",
    "ppc": "powerpc-linux-gnu-gcc",
    "ppc64": "powerpc64-linux-gnu-gcc",
    "ppc64le": "powerpc64le-linux-gnu-gcc",
    "riscv64": "riscv64-linux-gnu-gcc",
    "s390x": "s390x-linux-gnu-gcc",
    "sh4": "sh4-linux-gnu-gcc",
    "sparc64": "sparc64-linux-gnu-gcc",
    "x86_64": "gcc"
}
##############################################

benchmarks = sorted(os.listdir("benchmarks"))
csv_headers = ["Target", "Total Instructions",
               "Code Generation", "JIT Execution", "Helpers"]

for benchmark in benchmarks:
    # Remove file extension
    benchmark_name = os.path.splitext(benchmark)[0]
    for build_name, _ in builds.items():
        with open("tables/{}-{}.csv".format(benchmark_name, build_name), "w") as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(csv_headers)
    for target_name, target_compiler in targets.items():
        compile_target = subprocess.run([target_compiler,
                                         "-O2",
                                         "-static",
                                         "benchmarks/" + benchmark,
                                         "-o",
                                         "/tmp/" + benchmark_name])
        for build_name, build_path in builds.items():
            dissect_target = subprocess.run(["./dissect.py",
                                             "--",
                                             "{}/{}-linux-user/qemu-{}".format(
                                                 build_path, target_name, target_name),
                                             "/tmp/" + benchmark_name],
                                            stdout=subprocess.PIPE)
            # Read the dissect output
            lines = dissect_target.stdout.decode("utf-8").split('\n')
            # Extract measurements
            total_instructions = lines[0].split()[-1]
            code_generation = lines[2].split()[-2]
            jit_execution = lines[3].split()[-2]
            helpers_execution = lines[4].split()[-2]
            # Save output to CSV
            with open("tables/{}-{}.csv".format(benchmark_name, build_name), "a") as csv_file:
                writer = csv.writer(csv_file)
                writer.writerow([target_name,
                                 total_instructions,
                                 code_generation,
                                 jit_execution,
                                 helpers_execution])
        # Remove temporary file
        os.unlink("/tmp/" + benchmark_name)
The ten resulting tables are shown in the next section. In the Clang tables, numbers in green indicate a decrease in the number of instructions compared to GCC, and numbers in red indicate an increase.
Performance Tables
matmult_double (GCC)
Target | Total Instructions | Code Generation | JIT Execution | Helpers |
---|---|---|---|---|
aarch64 | 1 411 957 967 | 76 120 077 | 611 067 999 | 724 769 891 |
alpha | 3 020 752 954 | 56 143 853 | 437 106 648 | 2 527 502 453 |
arm | 8 721 987 230 | 285 174 602 | 6 633 909 955 | 1 802 902 673 |
hppa | 3 345 168 851 | 179 176 839 | 348 961 284 | 2 817 030 728 |
m68k | 3 327 223 458 | 65 872 033 | 547 273 911 | 2 714 077 514 |
mips | 2 262 859 230 | 79 312 482 | 362 219 758 | 1 821 326 990 |
mipsel | 3 176 135 194 | 79 192 739 | 402 266 174 | 2 694 676 281 |
mips64 | 2 276 881 008 | 87 067 585 | 364 561 337 | 1 825 252 086 |
mips64el | 3 189 604 541 | 86 891 148 | 404 111 884 | 2 698 601 509 |
ppc | 3 125 097 209 | 82 035 165 | 338 132 356 | 2 704 929 688 |
ppc64 | 3 202 929 198 | 88 654 395 | 379 358 099 | 2 734 916 704 |
ppc64le | 3 202 302 439 | 88 841 776 | 378 540 677 | 2 734 919 986 |
riscv64 | 1 222 310 471 | 60 743 087 | 305 444 092 | 856 123 292 |
s390x | 2 726 250 005 | 57 481 812 | 318 742 202 | 2 350 025 991 |
sh4 | 3 341 872 364 | 67 475 044 | 602 524 473 | 2 671 872 847 |
sparc64 | 3 205 825 118 | 134 723 352 | 501 019 705 | 2 570 082 061 |
x86_64 | 1 249 941 832 | 67 613 673 | 364 007 574 | 818 320 585 |
matmult_double (Clang)
Target | Total Instructions | Code Generation | JIT Execution | Helpers |
---|---|---|---|---|
aarch64 | 2 011 166 477 | 71 984 566 | 611 067 999 | 1 328 113 912 |
alpha | 4 673 820 741 | 55 207 732 | 437 106 648 | 4 181 506 361 |
arm | 8 746 454 169 | 276 672 219 | 6 633 909 955 | 1 835 871 995 |
hppa | 4 770 815 514 | 175 521 902 | 348 961 284 | 4 246 332 328 |
m68k | 3 542 927 301 | 65 164 233 | 547 273 911 | 2 930 489 157 |
mips | 3 738 305 471 | 72 287 768 | 362 219 758 | 3 303 797 945 |
mipsel | 5 239 048 897 | 72 499 086 | 402 266 174 | 4 764 283 637 |
mips64 | 3 750 825 939 | 77 896 420 | 364 561 337 | 3 308 368 182 |
mips64el | 5 251 030 988 | 78 065 089 | 404 111 884 | 4 768 854 015 |
ppc | 4 109 476 668 | 81 662 092 | 338 132 356 | 3 689 682 220 |
ppc64 | 4 170 489 431 | 87 779 750 | 379 358 099 | 3 703 351 582 |
ppc64le | 4 169 627 378 | 87 731 465 | 378 540 677 | 3 703 355 236 |
riscv64 | 1 799 067 648 | 59 198 155 | 305 444 092 | 1 434 425 401 |
s390x | 3 924 840 571 | 58 080 261 | 318 742 202 | 3 548 018 108 |
sh4 | 5 069 649 275 | 65 326 027 | 602 524 473 | 4 401 798 775 |
sparc64 | 4 918 273 993 | 131 200 185 | 501 019 705 | 4 286 054 103 |
x86_64 | 2 282 484 944 | 66 749 639 | 364 007 574 | 1 851 727 731 |
matmult_int32 (GCC)
Target | Total Instructions | Code Generation | JIT Execution | Helpers |
---|---|---|---|---|
aarch64 | 595 494 162 | 69 380 056 | 505 728 486 | 20 385 620 |
alpha | 370 047 513 | 50 451 964 | 307 209 296 | 12 386 253 |
arm | 735 549 496 | 276 195 690 | 410 431 931 | 48 921 875 |
hppa | 666 639 906 | 171 974 055 | 454 423 118 | 40 242 733 |
m68k | 406 939 070 | 59 081 632 | 329 037 444 | 18 819 994 |
mips | 497 096 141 | 71 679 287 | 416 946 656 | 8 470 198 |
mipsel | 497 010 030 | 71 506 947 | 417 032 765 | 8 470 318 |
mips64 | 478 992 426 | 78 298 258 | 388 302 800 | 12 391 368 |
mips64el | 462 357 562 | 78 074 494 | 371 890 705 | 12 392 363 |
ppc | 338 417 174 | 74 914 476 | 255 198 622 | 8 304 076 |
ppc64 | 390 129 095 | 80 972 586 | 297 023 711 | 12 132 798 |
ppc64le | 390 053 461 | 80 990 281 | 296 928 184 | 12 134 996 |
riscv64 | 349 030 315 | 54 578 504 | 281 826 149 | 12 625 662 |
s390x | 491 822 152 | 51 853 248 | 375 436 514 | 64 532 390 |
sh4 | 399 132 791 | 59 096 149 | 312 448 882 | 27 587 760 |
sparc64 | 488 710 835 | 123 407 698 | 353 794 723 | 11 508 414 |
x86_64 | 399 168 070 | 61 136 986 | 322 978 674 | 15 052 410 |
matmult_int32 (Clang)
Target | Total Instructions | Code Generation | JIT Execution | Helpers |
---|---|---|---|---|
aarch64 | 593 751 025 | 65 694 887 | 505 728 486 | 22 327 652 |
alpha | 371 067 721 | 50 014 259 | 307 209 296 | 13 844 166 |
arm | 727 777 546 | 267 935 229 | 410 431 931 | 49 410 386 |
hppa | 664 473 385 | 168 673 213 | 454 423 118 | 41 377 054 |
m68k | 408 524 487 | 58 641 545 | 329 037 444 | 20 845 498 |
mips | 491 762 457 | 65 531 396 | 416 946 656 | 9 284 405 |
mipsel | 491 996 655 | 65 679 356 | 417 032 765 | 9 284 534 |
mips64 | 472 458 134 | 70 305 442 | 388 302 800 | 13 849 892 |
mips64el | 456 176 576 | 70 434 870 | 371 890 705 | 13 851 001 |
ppc | 339 482 792 | 75 166 396 | 255 198 622 | 9 117 774 |
ppc64 | 391 412 561 | 80 800 275 | 297 023 711 | 13 588 575 |
ppc64le | 391 115 113 | 80 595 928 | 296 928 184 | 13 591 001 |
riscv64 | 349 721 940 | 53 570 099 | 281 826 149 | 14 325 692 |
s390x | 481 356 004 | 52 685 433 | 375 436 514 | 53 234 057 |
sh4 | 399 736 614 | 57 671 842 | 312 448 882 | 29 615 890 |
sparc64 | 486 470 314 | 121 641 278 | 353 794 723 | 11 034 313 |
x86_64 | 399 194 875 | 60 675 932 | 322 978 674 | 15 540 269 |
qsort_double (GCC)
Target | Total Instructions | Code Generation | JIT Execution | Helpers |
---|---|---|---|---|
aarch64 | 2 658 154 250 | 79 040 798 | 1 265 579 424 | 1 313 534 028 |
alpha | 1 949 114 474 | 57 399 955 | 869 643 481 | 1 022 071 038 |
arm | 9 118 694 070 | 850 895 346 | 4 265 464 995 | 4 002 333 729 |
hppa | 3 138 372 515 | 496 702 308 | 1 023 062 954 | 1 618 607 253 |
m68k | 4 385 213 371 | 63 926 269 | 1 224 197 193 | 3 097 089 909 |
mips | 2 098 316 452 | 82 146 339 | 942 372 869 | 1 073 797 244 |
mipsel | 2 098 510 236 | 81 980 330 | 942 732 894 | 1 073 797 012 |
mips64 | 1 970 640 902 | 90 221 912 | 787 224 479 | 1 093 194 511 |
mips64el | 1 968 095 838 | 90 070 316 | 784 830 991 | 1 093 194 531 |
ppc | 2 735 890 533 | 115 892 326 | 1 087 846 009 | 1 532 152 198 |
ppc64 | 2 684 919 199 | 122 371 298 | 1 028 369 561 | 1 534 178 340 |
ppc64le | 2 641 863 052 | 122 545 885 | 985 137 467 | 1 534 179 700 |
riscv64 | 1 589 964 563 | 62 644 918 | 643 559 932 | 883 759 713 |
s390x | 2 474 989 116 | 58 933 857 | 660 401 279 | 1 755 653 980 |
sh4 | 2 562 375 399 | 68 574 230 | 1 138 226 176 | 1 355 574 993 |
sparc64 | 3 917 963 038 | 2 055 155 359 | 1 298 625 002 | 564 182 677 |
x86_64 | 1 986 765 860 | 68 884 527 | 804 594 605 | 1 113 286 728 |
qsort_double (Clang)
Target | Total Instructions | Code Generation | JIT Execution | Helpers |
---|---|---|---|---|
aarch64 | 2 744 907 796 | 74 594 511 | 1 265 579 424 | 1 404 733 861 |
alpha | 2 070 204 037 | 56 286 633 | 869 643 481 | 1 144 273 923 |
arm | 9 163 293 622 | 822 921 830 | 4 265 464 995 | 4 074 906 797 |
hppa | 3 094 186 179 | 484 467 463 | 1 023 062 954 | 1 586 655 762 |
m68k | 4 114 339 886 | 63 107 725 | 1 224 197 193 | 2 827 034 968 |
mips | 2 240 757 989 | 74 728 757 | 942 372 869 | 1 223 656 363 |
mipsel | 2 241 264 168 | 74 875 161 | 942 732 894 | 1 223 656 113 |
mips64 | 2 125 667 762 | 80 578 997 | 787 224 479 | 1 257 864 286 |
mips64el | 2 123 478 152 | 80 782 855 | 784 830 991 | 1 257 864 306 |
ppc | 3 265 774 088 | 120 968 560 | 1 087 846 009 | 2 056 959 519 |
ppc64 | 3 198 309 630 | 127 098 740 | 1 028 369 561 | 2 042 841 329 |
ppc64le | 3 155 027 150 | 127 046 877 | 985 137 467 | 2 042 842 806 |
riscv64 | 1 667 477 270 | 60 857 834 | 643 559 932 | 963 059 504 |
s390x | 2 404 573 110 | 59 331 077 | 660 401 279 | 1 684 840 754 |
sh4 | 2 609 811 199 | 66 186 107 | 1 138 226 176 | 1 405 398 916 |
sparc64 | 4 189 185 205 | 2 351 778 131 | 1 298 625 002 | 538 782 072 |
x86_64 | 1 988 312 647 | 67 801 893 | 804 594 605 | 1 115 916 149 |
qsort_int32 (GCC)
Target | Total Instructions | Code Generation | JIT Execution | Helpers |
---|---|---|---|---|
aarch64 | 2 131 720 142 | 72 399 376 | 1 055 781 197 | 1 003 539 569 |
alpha | 1 460 366 380 | 52 934 795 | 796 943 218 | 610 488 367 |
arm | 3 372 434 621 | 843 360 743 | 1 078 158 662 | 1 450 915 216 |
hppa | 2 196 495 498 | 491 615 054 | 868 099 497 | 836 780 947 |
m68k | 1 779 675 824 | 58 332 907 | 966 478 982 | 754 863 935 |
mips | 1 499 858 843 | 74 197 186 | 842 808 363 | 582 853 294 |
mipsel | 1 502 516 358 | 74 066 097 | 845 597 211 | 582 853 050 |
mips64 | 1 498 104 595 | 84 972 133 | 802 161 902 | 610 970 560 |
mips64el | 1 477 116 988 | 84 679 402 | 781 628 546 | 610 809 040 |
ppc | 1 668 038 700 | 109 657 874 | 975 751 506 | 582 629 320 |
ppc64 | 1 779 152 045 | 115 936 847 | 1 072 790 643 | 590 424 555 |
ppc64le | 1 727 703 061 | 115 826 109 | 1 021 451 041 | 590 425 911 |
riscv64 | 1 289 198 318 | 57 502 772 | 624 840 792 | 606 854 754 |
s390x | 2 114 306 901 | 53 466 647 | 692 707 638 | 1 368 132 616 |
sh4 | 1 878 429 484 | 61 060 371 | 913 781 294 | 903 587 819 |
sparc64 | 3 352 057 480 | 2 022 774 129 | 1 141 078 790 | 188 204 561 |
x86_64 | 1 751 081 171 | 63 007 727 | 765 175 073 | 922 898 371 |
qsort_int32 (Clang)
Target | Total Instructions | Code Generation | JIT Execution | Helpers |
---|---|---|---|---|
aarch64 | 2 223 343 530 | 68 447 052 | 1 055 781 197 | 1 099 115 281 |
alpha | 1 531 490 101 | 52 235 802 | 796 943 218 | 682 311 081 |
arm | 3 370 059 365 | 815 660 694 | 1 078 158 662 | 1 476 240 009 |
hppa | 2 212 542 138 | 479 758 595 | 868 099 497 | 864 684 046 |
m68k | 1 853 277 157 | 57 802 833 | 966 478 982 | 828 995 342 |
mips | 1 549 431 653 | 67 726 753 | 842 808 363 | 638 896 537 |
mipsel | 1 552 417 791 | 67 924 305 | 845 597 211 | 638 896 275 |
mips64 | 1 559 572 362 | 76 178 736 | 802 161 902 | 681 231 724 |
mips64el | 1 539 491 325 | 76 254 175 | 781 628 546 | 681 608 604 |
ppc | 1 730 823 937 | 115 323 201 | 975 751 506 | 639 749 230 |
ppc64 | 1 855 390 899 | 121 324 924 | 1 072 790 643 | 661 275 332 |
ppc64le | 1 803 732 493 | 121 004 644 | 1 021 451 041 | 661 276 808 |
riscv64 | 1 369 588 518 | 56 200 727 | 624 840 792 | 688 546 999 |
s390x | 2 018 273 542 | 54 160 605 | 692 707 638 | 1 271 405 299 |
sh4 | 1 943 201 025 | 59 392 201 | 913 781 294 | 970 027 530 |
sparc64 | 3 626 612 511 | 2 317 128 970 | 1 141 078 790 | 168 404 751 |
x86_64 | 1 759 411 677 | 62 347 319 | 765 175 073 | 931 889 285 |
qsort_string (GCC)
Target | Total Instructions | Code Generation | JIT Execution | Helpers |
---|---|---|---|---|
aarch64 | 2 530 266 115 | 71 878 428 | 1 449 639 434 | 1 008 748 253 |
alpha | 1 793 598 941 | 52 114 495 | 1 117 230 092 | 624 254 354 |
arm | 7 155 712 165 | 3 673 959 444 | 1 747 578 444 | 1 734 174 277 |
hppa | 4 596 662 435 | 2 077 290 104 | 1 188 975 849 | 1 330 396 482 |
m68k | 2 295 149 555 | 58 023 587 | 1 343 485 580 | 893 640 388 |
mips | 2 113 525 462 | 74 968 779 | 1 501 957 602 | 536 599 081 |
mipsel | 2 110 264 854 | 74 820 076 | 1 498 845 813 | 536 598 965 |
mips64 | 1 968 388 319 | 81 540 025 | 1 278 967 522 | 607 880 772 |
mips64el | 1 950 703 742 | 81 332 799 | 1 261 490 179 | 607 880 764 |
ppc | 2 428 873 715 | 269 273 748 | 1 616 960 349 | 542 639 618 |
ppc64 | 2 404 019 471 | 273 403 312 | 1 361 586 990 | 769 029 169 |
ppc64le | 2 385 236 674 | 273 482 576 | 1 342 609 438 | 769 144 660 |
riscv64 | 1 563 526 901 | 56 535 790 | 880 912 912 | 626 078 199 |
s390x | 3 934 202 536 | 52 728 806 | 868 450 778 | 3 013 022 952 |
sh4 | 2 097 991 097 | 61 043 883 | 1 155 615 736 | 881 331 478 |
sparc64 | 4 130 814 212 | 2 078 206 607 | 1 572 858 282 | 479 749 323 |
x86_64 | 2 864 486 422 | 63 565 441 | 1 160 432 349 | 1 640 488 632 |
qsort_string (Clang)
Target | Total Instructions | Code Generation | JIT Execution | Helpers |
---|---|---|---|---|
aarch64 | 2 622 482 230 | 68 022 754 | 1 449 639 434 | 1 104 820 042 |
alpha | 1 866 463 476 | 51 536 810 | 1 117 230 092 | 697 696 574 |
arm | 7 056 218 429 | 3 549 558 538 | 1 747 578 444 | 1 759 081 447 |
hppa | 4 584 536 797 | 2 023 729 867 | 1 188 975 849 | 1 371 831 081 |
m68k | 2 391 766 444 | 57 534 808 | 1 343 485 580 | 990 746 056 |
mips | 2 158 611 852 | 68 459 450 | 1 501 957 602 | 588 194 800 |
mipsel | 2 155 675 997 | 68 635 509 | 1 498 845 813 | 588 194 675 |
mips64 | 2 031 507 933 | 73 143 579 | 1 278 967 522 | 679 396 832 |
mips64el | 2 014 166 270 | 73 279 267 | 1 261 490 179 | 679 396 824 |
ppc | 2 516 388 621 | 303 589 290 | 1 616 960 349 | 595 838 982 |
ppc64 | 2 384 468 962 | 308 642 876 | 1 361 586 990 | 714 239 096 |
ppc64le | 2 365 387 612 | 308 506 861 | 1 342 609 438 | 714 271 313 |
riscv64 | 1 646 638 770 | 55 367 615 | 880 912 912 | 710 358 243 |
s390x | 3 475 927 973 | 53 526 410 | 868 450 778 | 2 553 950 785 |
sh4 | 2 161 178 295 | 59 427 892 | 1 155 615 736 | 946 134 667 |
sparc64 | 4 410 070 085 | 2 377 799 055 | 1 572 858 282 | 459 412 748 |
x86_64 | 2 874 783 326 | 62 956 885 | 1 160 432 349 | 1 651 394 092 |
Analysis of Results
Comparison Script
To facilitate the analysis, another Python script can be used to compare Clang performance to that of GCC for each benchmark. The result is five tables (one for each benchmark) with the percentage of increase or decrease in Clang instructions compared to GCC. The script assumes that all the tables from the previous section are in a tables directory.
import os
import csv


def calculate_change(gcc_instructions, clang_instructions):
    # Calculate the percentage of change in Clang instructions compared to GCC
    percentage = round(((clang_instructions - gcc_instructions) /
                        gcc_instructions) * 100, 3)
    return "+" + str(percentage) + "%" if percentage > 0 else str(percentage) + "%"


tables = sorted(os.listdir("tables"))
csv_headers = ["Target", "Total Instructions %",
               "Code Generation %", "JIT Execution %", "Helpers %"]

for i in range(0, len(tables), 2):
    benchmark_name = tables[i].split("-")[0]
    # Extract data from tables
    clang_data, gcc_data = [], []
    with open("tables/" + tables[i], "r") as file:
        clang_data = file.readlines()
    with open("tables/" + tables[i + 1], "r") as file:
        gcc_data = file.readlines()
    with open(benchmark_name + "-compare.csv", "w") as file:
        writer = csv.writer(file)
        writer.writerow(csv_headers)
    for l in range(1, len(gcc_data)):
        gcc_split = gcc_data[l].split('"')
        clang_split = clang_data[l].split('"')
        target_name = gcc_split[0][:-1]
        gcc_instructions = int(gcc_split[1].replace(",", ""))
        clang_instructions = int(clang_split[1].replace(",", ""))
        instructions_change = calculate_change(
            gcc_instructions, clang_instructions)
        gcc_code_generation = int(gcc_split[3].replace(",", ""))
        clang_code_generation = int(clang_split[3].replace(",", ""))
        code_generation_change = calculate_change(
            gcc_code_generation, clang_code_generation)
        gcc_jit = int(gcc_split[5].replace(",", ""))
        clang_jit = int(clang_split[5].replace(",", ""))
        jit_change = calculate_change(gcc_jit, clang_jit)
        gcc_helpers = int(gcc_split[7].replace(",", ""))
        clang_helpers = int(clang_split[7].replace(",", ""))
        helpers_change = calculate_change(gcc_helpers, clang_helpers)
        with open(benchmark_name + "-compare.csv", "a") as file:
            writer = csv.writer(file)
            writer.writerow([target_name,
                             instructions_change,
                             code_generation_change,
                             jit_change,
                             helpers_change])
matmult_double
Target | Total Instructions % | Code Generation % | JIT Execution % | Helpers % |
---|---|---|---|---|
aarch64 | +42.438% | -5.433% | 0.0% | +83.246% |
alpha | +54.724% | -1.667% | 0.0% | +65.44% |
arm | +0.281% | -2.981% | 0.0% | +1.829% |
hppa | +42.618% | -2.04% | 0.0% | +50.738% |
m68k | +6.483% | -1.075% | 0.0% | +7.974% |
mips | +65.203% | -8.857% | 0.0% | +81.395% |
mipsel | +64.95% | -8.452% | 0.0% | +76.804% |
mips64 | +64.735% | -10.533% | 0.0% | +81.255% |
mips64el | +64.63% | -10.158% | 0.0% | +76.716% |
ppc | +31.499% | -0.455% | 0.0% | +36.406% |
ppc64 | +30.209% | -0.987% | 0.0% | +35.41% |
ppc64le | +30.207% | -1.25% | 0.0% | +35.41% |
riscv64 | +47.186% | -2.543% | 0.0% | +67.549% |
s390x | +43.965% | +1.041% | 0.0% | +50.978% |
sh4 | +51.701% | -3.185% | 0.0% | +64.746% |
sparc64 | +53.417% | -2.615% | 0.0% | +66.767% |
x86_64 | +82.607% | -1.278% | 0.0% | +126.284% |
matmult_int32
Target | Total Instructions % | Code Generation % | JIT Execution % | Helpers % |
---|---|---|---|---|
aarch64 | -0.293% | -5.312% | 0.0% | +9.526% |
alpha | +0.276% | -0.868% | 0.0% | +11.77% |
arm | -1.057% | -2.991% | 0.0% | +0.999% |
hppa | -0.325% | -1.919% | 0.0% | +2.819% |
m68k | +0.39% | -0.745% | 0.0% | +10.763% |
mips | -1.073% | -8.577% | 0.0% | +9.613% |
mipsel | -1.009% | -8.15% | 0.0% | +9.613% |
mips64 | -1.364% | -10.208% | 0.0% | +11.77% |
mips64el | -1.337% | -9.785% | 0.0% | +11.77% |
ppc | +0.315% | +0.336% | 0.0% | +9.799% |
ppc64 | +0.329% | -0.213% | 0.0% | +11.999% |
ppc64le | +0.272% | -0.487% | 0.0% | +11.998% |
riscv64 | +0.198% | -1.848% | 0.0% | +13.465% |
s390x | -2.128% | +1.605% | 0.0% | -17.508% |
sh4 | +0.151% | -2.41% | 0.0% | +7.352% |
sparc64 | -0.458% | -1.431% | 0.0% | -4.12% |
x86_64 | +0.007% | -0.754% | 0.0% | +3.241% |
qsort_double
Target | Total Instructions % | Code Generation % | JIT Execution % | Helpers % |
---|---|---|---|---|
aarch64 | +3.264% | -5.625% | 0.0% | +6.943% |
alpha | +6.213% | -1.94% | 0.0% | +11.956% |
arm | +0.489% | -3.288% | 0.0% | +1.813% |
hppa | -1.408% | -2.463% | 0.0% | -1.974% |
m68k | -6.177% | -1.28% | 0.0% | -8.72% |
mips | +6.788% | -9.03% | 0.0% | +13.956% |
mipsel | +6.803% | -8.667% | 0.0% | +13.956% |
mips64 | +7.867% | -10.688% | 0.0% | +15.063% |
mips64el | +7.895% | -10.311% | 0.0% | +15.063% |
ppc | +19.368% | +4.38% | 0.0% | +34.253% |
ppc64 | +19.121% | +3.863% | 0.0% | +33.155% |
ppc64le | +19.424% | +3.673% | 0.0% | +33.155% |
riscv64 | +4.875% | -2.853% | 0.0% | +8.973% |
s390x | -2.845% | +0.674% | 0.0% | -4.033% |
sh4 | +1.851% | -3.483% | 0.0% | +3.675% |
sparc64 | +6.923% | +14.433% | 0.0% | -4.502% |
x86_64 | +0.078% | -1.572% | 0.0% | +0.236% |
qsort_int32
Target | Total Instructions % | Code Generation % | JIT Execution % | Helpers % |
---|---|---|---|---|
aarch64 | +4.298% | -5.459% | 0.0% | +9.524% |
alpha | +4.87% | -1.32% | 0.0% | +11.765% |
arm | -0.07% | -3.284% | 0.0% | +1.745% |
hppa | +0.731% | -2.412% | 0.0% | +3.335% |
m68k | +4.136% | -0.909% | 0.0% | +9.82% |
mips | +3.305% | -8.721% | 0.0% | +9.615% |
mipsel | +3.321% | -8.292% | 0.0% | +9.615% |
mips64 | +4.103% | -10.349% | 0.0% | +11.5% |
mips64el | +4.223% | -9.95% | 0.0% | +11.591% |
ppc | +3.764% | +5.166% | 0.0% | +9.804% |
ppc64 | +4.285% | +4.647% | 0.0% | +12.0% |
ppc64le | +4.401% | +4.471% | 0.0% | +12.0% |
riscv64 | +6.236% | -2.264% | 0.0% | +13.462% |
s390x | -4.542% | +1.298% | 0.0% | -7.07% |
sh4 | +3.448% | -2.732% | 0.0% | +7.353% |
sparc64 | +8.191% | +14.552% | 0.0% | -10.52% |
x86_64 | +0.476% | -1.048% | 0.0% | +0.974% |
qsort_string
Target | Total Instructions % | Code Generation % | JIT Execution % | Helpers % |
---|---|---|---|---|
aarch64 | +3.645% | -5.364% | 0.0% | +9.524% |
alpha | +4.062% | -1.108% | 0.0% | +11.765% |
arm | -1.39% | -3.386% | 0.0% | +1.436% |
hppa | -0.264% | -2.578% | 0.0% | +3.114% |
m68k | +4.21% | -0.842% | 0.0% | +10.866% |
mips | +2.133% | -8.683% | 0.0% | +9.615% |
mipsel | +2.152% | -8.266% | 0.0% | +9.615% |
mips64 | +3.207% | -10.297% | 0.0% | +11.765% |
mips64el | +3.253% | -9.902% | 0.0% | +11.765% |
ppc | +3.603% | +12.744% | 0.0% | +9.804% |
ppc64 | -0.813% | +12.889% | 0.0% | -7.125% |
ppc64le | -0.832% | +12.807% | 0.0% | -7.134% |
riscv64 | +5.316% | -2.066% | 0.0% | +13.462% |
s390x | -11.648% | +1.513% | 0.0% | -15.236% |
sh4 | +3.012% | -2.647% | 0.0% | +7.353% |
sparc64 | +6.76% | +14.416% | 0.0% | -4.239% |
x86_64 | +0.359% | -0.957% | 0.0% | +0.665% |
Floating Point Benchmarks
For all five benchmarks, most targets had a decrease in the number of instructions spent in code generation; however, there was a major increase in the number of instructions spent in the execution of helpers.
To find out the reason behind this increase, the list_helpers.py script can be used with the matmult_double benchmark (which had the biggest drop in Clang performance) and any of the seventeen targets to list the executed helpers.
List helpers of ppc for matmult_double on GCC:
./list_helpers.py -- <qemu-gcc-build>/ppc-linux-user/qemu-ppc matmult_double-ppc
Results:
Executed QEMU Helpers:
No. Instructions Percentage Calls Ins/Call Helper Name Source File
---- --------------- ---------- --------------- ---------- ------------------------- ------------------------------
1 2,088,642,242 66.832% 8,000,000 261 helper_fmadd <qemu>/target/ppc/fpu_helper.c
2 420,240,000 13.447% 8,240,000 51 helper_compute_fprf_float64 <qemu>/target/ppc/fpu_helper.c
3 139,760,120 4.472% 8,240,008 16 helper_float_check_status <qemu>/target/ppc/fpu_helper.c
4 16,480,024 0.527% 8,240,012 2 helper_reset_fpstatus <qemu>/include/fpu/softfloat-helpers.h
5 11,167,515 0.357% 80,000 139 helper_fmul <qemu>/target/ppc/fpu_helper.c
6 10,320,000 0.330% 80,000 129 helper_fsub <qemu>/target/ppc/fpu_helper.c
7 10,000,000 0.320% 80,000 125 helper_fdiv <qemu>/target/ppc/fpu_helper.c
8 8,314,772 0.266% 162,603 51 helper_lookup_tb_ptr <qemu>/accel/tcg/tcg-runtime.c
9 2,618 0.000% 14 187 helper_dcbz <qemu>/target/ppc/mem_helper.c
10 1,494 0.000% 18 83 helper_raise_exception_err <qemu>/target/ppc/excp_helper.c
11 1,012 0.000% 8 126 helper_fcmpu <qemu>/target/ppc/fpu_helper.c
List helpers of ppc for matmult_double on Clang:
./list_helpers.py -- <qemu-clang-build>/ppc-linux-user/qemu-ppc matmult_double-ppc
Results:
Executed QEMU Helpers:
No. Instructions Percentage Calls Ins/Call Helper Name Source File
---- --------------- ---------- --------------- ---------- ------------------------- ------------------------------
1 3,040,716,864 73.990% 8,000,000 380 helper_fmadd <qemu>/target/ppc/fpu_helper.c
2 403,760,000 9.825% 8,240,000 49 helper_compute_fprf_float64 <qemu>/target/ppc/fpu_helper.c
3 164,480,144 4.002% 8,240,008 19 helper_float_check_status <qemu>/target/ppc/fpu_helper.c
4 18,800,000 0.457% 80,000 235 helper_fsub <qemu>/target/ppc/fpu_helper.c
5 18,230,012 0.444% 80,000 227 helper_fmul <qemu>/target/ppc/fpu_helper.c
6 18,080,000 0.440% 80,000 226 helper_fdiv <qemu>/target/ppc/fpu_helper.c
7 16,480,024 0.401% 8,240,012 2 helper_reset_fpstatus <qemu>/include/fpu/softfloat-helpers.h
8 9,127,473 0.222% 162,603 56 helper_lookup_tb_ptr <qemu>/include/exec/exec-all.h
9 3,774 0.000% 18 209 helper_raise_exception_err <qemu>/target/ppc/excp_helper.c
10 2,492 0.000% 14 178 helper_dcbz <qemu>/target/ppc/mem_helper.c
11 1,544 0.000% 8 193 helper_fcmpu <qemu>/target/ppc/fpu_helper.c
All floating point helpers had an increase in their number of instructions per call. The list_fn_callees.py script can be used to analyze one of these helpers.
List callees of helper_fdiv on GCC:
./list_fn_callees.py -f helper_fdiv -- <qemu-gcc-build>/ppc-linux-user/qemu-ppc matmult_double-ppc
Results:
Callees of helper_fdiv:
No. Instructions Percentage Calls Ins/Call Function Name Source File
---- --------------- ---------- --------------- ---------- ------------------------- ------------------------------
1 7,840,000 0.251% 80,000 98 float64_div <qemu>/include/qemu/bitops.h
List callees of helper_fdiv on Clang:
./list_fn_callees.py -f helper_fdiv -- <qemu-clang-build>/ppc-linux-user/qemu-ppc matmult_double-ppc
Results:
Callees of helper_fdiv:
No. Instructions Percentage Calls Ins/Call Function Name Source File
---- --------------- ---------- --------------- ---------- ------------------------- ------------------------------
1 15,200,000 0.370% 80,000 190 float64_div <qemu>/fpu/softfloat.c
List callees of float64_div on GCC:
./list_fn_callees.py -f float64_div -- <qemu-gcc-build>/ppc-linux-user/qemu-ppc matmult_double-ppc
Results:
Couldn't locate function: float64_div.
List callees of float64_div on Clang:
./list_fn_callees.py -f float64_div -- <qemu-clang-build>/ppc-linux-user/qemu-ppc matmult_double-ppc
Callees of float64_div:
No. Instructions Percentage Calls Ins/Call Function Name Source File
---- --------------- ---------- --------------- ---------- ------------------------- ------------------------------
1 5,760,000 0.140% 80,000 72 round_canonical <qemu>/fpu/softfloat.c
The source code of the float64_div function, which is the callee of helper_fdiv, is attached in the appendix of the report. Notice how the QEMU_FLATTEN attribute is used on the function: GCC was able to inline all of the functions beneath it to the maximum depth, while Clang failed to do so for float64_round_pack_canonical, causing the round_canonical function to be called explicitly and costing extra instructions.
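As a standalone illustration of what the attribute requests (hypothetical example code, not QEMU source), __attribute__((flatten)), which QEMU_FLATTEN expands to, asks the compiler to inline, where possible, every call made inside the annotated function, including calls inside the inlined bodies:
/* Hypothetical example, not QEMU code: flatten asks the compiler to
 * inline (where possible) the whole call tree under the function. */
#include <stdio.h>

static double square(double x)
{
    return x * x;
}

static double sum_of_squares(double x, double y)
{
    /* Under flatten, these calls to square() should also be inlined. */
    return square(x) + square(y);
}

static double __attribute__((flatten)) distance_squared(double x, double y)
{
    /* With full flattening no call remains in the generated code; a
     * compiler that gives up on part of the tree leaves explicit calls
     * behind, which is the extra per-call cost observed for Clang above. */
    return sum_of_squares(x, y);
}

int main(void)
{
    printf("%f\n", distance_squared(3.0, 4.0));
    return 0;
}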
This is the reason behind the increase in the number of instructions per call for all targets and for most of the helpers, and it shows that GCC does a better job of optimizing functions marked with __attribute__((flatten)). Since this is a compiler optimization issue, it might be fixed in future versions of Clang.
cpu_loop_exit:
By following the same procedure to analyze helper_raise_exception_err, it can be seen that cpu_loop_exit uses a different function to perform the long jump in the Clang build. This function (longjmp@GLIBC_2.2.5) executes nearly three times the number of instructions per call of the function used in the GCC build (__longjmp_chk).
This cpu_loop_exit behavior also appears in most of the other targets.
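For context, cpu_loop_exit unwinds back to QEMU's main execution loop with a long jump; the simplified sketch below (hypothetical code, not the QEMU implementation) shows the pattern. Which libc symbol that jump binds to, the plain longjmp/siglongjmp or the fortified __longjmp_chk, depends on build options such as whether _FORTIFY_SOURCE is enabled by default, which is one plausible explanation for the difference between the two builds:
/* Hypothetical sketch of the setjmp/longjmp pattern behind
 * cpu_exec()/cpu_loop_exit(); this is not the QEMU implementation. */
#include <setjmp.h>
#include <stdio.h>

static sigjmp_buf jmp_env;      /* QEMU keeps a similar buffer per vCPU */

static void fake_cpu_loop_exit(void)
{
    /* Unwind back to the main loop. Depending on build flags (e.g.
     * _FORTIFY_SOURCE), this call may bind to longjmp/siglongjmp or
     * to the fortified __longjmp_chk. */
    siglongjmp(jmp_env, 1);
}

int main(void)
{
    if (sigsetjmp(jmp_env, 0) == 0) {
        /* "Run" guest code until an exception forces a loop exit. */
        fake_cpu_loop_exit();
    } else {
        puts("back in the main loop");
    }
    return 0;
}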
Integer and String Benchmarks
These benchmarks (matmult_int32, qsort_int32, and qsort_string) use very few helpers compared to the benchmarks involving floating point numbers, so the change in performance is mainly attributed to code generation. Degradation in helpers still occurs, and one of its causes is the different long jump function used in the two builds, as discussed above.
QEMU Binary Size
To conclude the report, a small Python script is used to measure the size of the QEMU executable for each target under both the GCC and Clang builds. The last column in the output table is the percentage of change in the Clang binary size compared to GCC.
import os
import csv


def convert_bytes(n):
    for x in ['bytes', 'KB', 'MB', 'GB', 'TB']:
        if n < 1024.0:
            return "%3.1f %s" % (n, x)
        n /= 1024.0


builds = {
    "gcc": "<qemu-gcc-build>",
    "clang": "<qemu-clang-build>"
}

targets = ["aarch64", "alpha", "arm", "hppa", "m68k", "mips", "mipsel",
           "mips64", "mips64el", "ppc", "ppc64", "ppc64le", "riscv64",
           "s390x", "sh4", "sparc64", "x86_64"]

csv_headers = ["Target", "GCC", "Clang", "Difference %"]

with open("compare_exe.csv", "w") as file:
    writer = csv.writer(file)
    writer.writerow(csv_headers)

for target in targets:
    size = []
    for build_name, build_path in builds.items():
        size.append(os.path.getsize(
            "{}/{}-linux-user/qemu-{}".format(build_path, target, target)))
    with open("compare_exe.csv", "a") as file:
        writer = csv.writer(file)
        writer.writerow([target,
                         convert_bytes(size[0]),
                         convert_bytes(size[1]),
                         str(round(((size[1] - size[0]) / size[1]) * 100, 3)) + "%"])
Results
Target | GCC | Clang | Difference % |
---|---|---|---|
aarch64 | 27.6 MB | 26.4 MB | -4.722% |
alpha | 15.1 MB | 11.9 MB | -26.789% |
arm | 20.1 MB | 17.7 MB | -13.108% |
hppa | 15.4 MB | 12.4 MB | -24.662% |
m68k | 16.1 MB | 12.9 MB | -24.916% |
mips | 25.5 MB | 23.7 MB | -7.304% |
mipsel | 25.3 MB | 23.7 MB | -6.577% |
mips64 | 25.8 MB | 24.2 MB | -6.743% |
mips64el | 25.7 MB | 24.2 MB | -6.067% |
ppc | 20.1 MB | 19.2 MB | -4.539% |
ppc64 | 20.4 MB | 19.7 MB | -3.337% |
ppc64le | 20.2 MB | 19.7 MB | -2.52% |
riscv64 | 20.1 MB | 18.1 MB | -10.943% |
s390x | 17.2 MB | 13.7 MB | -25.374% |
sh4 | 15.0 MB | 11.8 MB | -27.197% |
sparc64 | 15.8 MB | 12.7 MB | -24.964% |
x86_64 | 17.2 MB | 13.9 MB | -23.38% |
Appendix
float64_div Implementation
float64 QEMU_FLATTEN
float64_div(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
                        f64_div_pre, f64_div_post);
}

static inline float64
float64_gen2(float64 xa, float64 xb, float_status *s,
             hard_f64_op2_fn hard, soft_f64_op2_fn soft,
             f64_check_fn pre, f64_check_fn post)
{
    union_float64 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f64_is_inf(ur))) {
        s->float_exception_flags |= float_flag_overflow;
    } else if (unlikely(fabs(ur.h) <= DBL_MIN) && post(ua, ub)) {
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_div(float64 a, float64 b, float_status *status)
{
    FloatParts pa = float64_unpack_canonical(a, status);
    FloatParts pb = float64_unpack_canonical(b, status);
    FloatParts pr = div_floats(pa, pb, status);

    return float64_round_pack_canonical(pr, status);
}

static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
{
    return float64_pack_raw(round_canonical(p, s, &float64_params));
}