问题描述:

Following this thread ...

For this piece of code:

#include <stdio.h>

/*
 * Demo for the question: prints i and its size_t conversion u for
 * i = 0 .. 9, so that the generated assembly for the int -> size_t
 * cast can be inspected.  Always returns 0.
 */
int main(void)

{

int i;

size_t u;

for (i = 0; i < 10; i++) {

u = (size_t)i; /* the int -> size_t conversion the question asks about */

printf("i = %d, u = %zu\n", i, u);

}

return 0;

}

The output in assembly is:

EDIT: Compiled with -O2

 .file "demo.c"  # GCC -O2 output for the demo program (AT&T syntax, x86-64)

.section .rodata.str1.1,"aMS",@progbits,1

.LC0:

.string "i = %d, u = %zu\n"  # printf format string, referenced below as $.LC0

.section .text.startup,"ax",@progbits

.p2align 4,,15

.globl main

.type main, @function

# main: calls printf(.LC0, i, u) ten times, then returns 0.
# Register roles: %rbx is the single loop counter; its full 64 bits are
# passed as u and its low 32 bits (%ebx) as i.  There is no separate
# sign-extension instruction (no cltq) anywhere in the loop.
main:

.LFB3:

.cfi_startproc

pushq %rbx  # %rbx is callee-saved and is used as the counter, so preserve it

.cfi_def_cfa_offset 16

.cfi_offset 3, -16

xorl %ebx, %ebx  # counter = 0 (a 32-bit write also clears the upper half of %rbx)

.p2align 4,,10

.p2align 3

.L2:  # loop top

movq %rbx, %rdx  # 3rd argument: u, the full 64-bit counter

movl %ebx, %esi  # 2nd argument: i, the low 32 bits of the same counter

xorl %eax, %eax  # %al = 0: no vector-register arguments for the variadic call

movl $.LC0, %edi  # 1st argument: address of the format string (32-bit absolute)

addq $1, %rbx  # advance the counter before the call; %rbx survives the call

call printf

cmpq $10, %rbx  # repeat until the counter reaches 10

jne .L2

xorl %eax, %eax  # return value 0

popq %rbx  # restore the caller's %rbx

.cfi_def_cfa_offset 8

ret

.cfi_endproc

.LFE3:

.size main, .-main

.ident "GCC: (Debian 4.7.2-5) 4.7.2"

.section .note.GNU-stack,"",@progbits

Is the conversion u = (size_t)i; consuming extra cycles?

网友答案:

Yes, it does, as it changes the internal representation from 32-bit to 64-bit. Specifically,

.L3:
    movl    -4(%rbp), %eax
    cltq
    movq    %rax, -16(%rbp)
    movq    -16(%rbp), %rdx

This reads i, performs sign-extension, and copies the result to %rdx. I'm unsure why this value has to pass through the stack - as Mats pointed out, this looks like code from a non-optimizing compiler run.

EDIT

In the optimized assembly code, the loop counter is maintained as the wider data type. AFAIR, movs between registers don't differ in run-time cycles between quad and dword operands (indeed they don't: see table C-16 in Intel's pertinent documentation, referenced by this SO post).

网友答案:

Yes, as the code is posted, certainly. Your conversion is here:

movl    -4(%rbp), %eax
cltq
movq    %rax, -16(%rbp)

Of course, this code is unoptimized, so it's not a very fair comparison. If you compile it with optimization, the compiler may realize that the values are always positive and just do a single move from whatever register holds i to %rdx that holds the third argument.

Edit:

As suspected, there is essentially no overhead in the optimized code. In this case, the compiler has converted the loop to count up u, and derive i from u instead of the other way around, so %rbx is used for the loop, and the value of i is just using %ebx, which is the lower 32 bits of %rbx - so there is no overhead in this example. I emphasise this, since there may well be other cases where converting from int to size_t will have a penalty. It completely depends on the circumstances.

网友答案:

I'm not sure the conversion is what is actually consuming cycles for you; I believe it is the assignment that is consuming them.

For example, look at this t1.c:

#include <stdio.h>

/*
 * t1.c: baseline with no assignment to u inside the loop.
 * NOTE(review): u is declared but never assigned anywhere in this
 * function, so the value passed to printf is whatever the variable
 * happens to hold.
 */
int main(void)
{
    int i;
    size_t u;   /* never assigned in this snippet */

    for (i = 0; i < 10; i++) {
        printf("i = %d, u = %zu\n", i, u);
    }
    return 0;
}

and the assembly for t1.c:

        .file   "t1.c"          # 32-bit x86 listing for t1.c (AT&T syntax)
        .section        .rodata
.LC0:
        .string "i = %d, u = %zu\n"     # printf format string
        .text
.globl main
        .type   main, @function
# main: the loop counter i lives at 24(%esp) and u at 28(%esp).
# printf's arguments are written to (%esp), 4(%esp) and 8(%esp).
main:
        pushl   %ebp
        movl    %esp, %ebp
        andl    $-16, %esp              # round %esp down to a 16-byte boundary
        subl    $32, %esp               # room for the locals and the outgoing arguments
        movl    $0, 24(%esp)            # i = 0
        jmp     .L2                     # enter the loop at its condition check
.L3:
        movl    $.LC0, %eax
        movl    28(%esp), %edx          # load u; nothing in this listing ever stores to it
        movl    %edx, 8(%esp)           # 3rd argument: u
        movl    24(%esp), %edx
        movl    %edx, 4(%esp)           # 2nd argument: i
        movl    %eax, (%esp)            # 1st argument: format string
        call    printf
        addl    $1, 24(%esp)            # i++
.L2:
        cmpl    $9, 24(%esp)            # continue while i <= 9
        jle     .L3
        movl    $0, %eax                # return 0
        leave
        ret
        .size   main, .-main
        .ident  "GCC: (GNU) 4.4.6 20110731 (Red Hat 4.4.6-3)"
        .section        .note.GNU-stack,"",@progbits

In the above case there is no assignment at all, which is fine for now.

second case t2.c

#include <stdio.h>

/*
 * t2.c: adds an assignment with a size_t cast inside the loop.
 * NOTE(review): the assignment here is i = (size_t) u, i.e. u is copied
 * into i.  That is the opposite direction to the question's
 * u = (size_t)i.  u is never assigned, and the loop counter i is
 * overwritten from it on every iteration.
 */
int main(void)
{
    int i;
    size_t u;   /* never assigned in this snippet */

    for (i = 0; i < 10; i++) {
        i = (size_t) u;   /* copies u into the loop counter */
        printf("i = %d, u = %zu\n", i, u);
    }
    return 0;
}

and the subsequent assembly:

        .file   "t2.c"          # 32-bit x86 listing for t2.c (AT&T syntax)
        .section        .rodata
.LC0:
        .string "i = %d, u = %zu\n"     # printf format string
        .text
.globl main
        .type   main, @function
# main: the loop counter i lives at 24(%esp) and u at 28(%esp).
# Compared with the t1 listing, the only addition is the two-instruction
# copy at the top of .L3.
main:
        pushl   %ebp
        movl    %esp, %ebp
        andl    $-16, %esp              # round %esp down to a 16-byte boundary
        subl    $32, %esp               # room for the locals and the outgoing arguments
        movl    $0, 24(%esp)            # i = 0
        jmp     .L2                     # enter the loop at its condition check
.L3:
        movl    28(%esp), %eax          # i = (size_t) u: a plain 4-byte load of u ...
        movl    %eax, 24(%esp)          # ... and store into i; no extension instruction
        movl    $.LC0, %eax
        movl    28(%esp), %edx
        movl    %edx, 8(%esp)           # 3rd argument: u
        movl    24(%esp), %edx
        movl    %edx, 4(%esp)           # 2nd argument: i
        movl    %eax, (%esp)            # 1st argument: format string
        call    printf
        addl    $1, 24(%esp)            # i++
.L2:
        cmpl    $9, 24(%esp)            # continue while i <= 9
        jle     .L3
        movl    $0, %eax                # return 0
        leave
        ret
        .size   main, .-main
        .ident  "GCC: (GNU) 4.4.6 20110731 (Red Hat 4.4.6-3)"
        .section        .note.GNU-stack,"",@progbits

Check the statements above

movl    28(%esp), %eax
movl    %eax, 24(%esp)

now for the last example t3.c

#include <stdio.h>

/*
 * t3.c: same loop as t2.c, but u is declared as int and assigned to i
 * without a cast.
 * NOTE(review): u is never assigned, and the format string still uses
 * %zu for u even though u is an int in this variant.
 */
int main(void)
{
    int i;
    int u;      /* int here rather than size_t; never assigned */

    for (i = 0; i < 10; i++) {
        i = u;  /* same-type copy into the loop counter */
        printf("i = %d, u = %zu\n", i, u);
    }
    return 0;
}

and the subsequent assembly

        .file   "t3.c"          # 32-bit x86 listing for t3.c (AT&T syntax)
        .section        .rodata
.LC0:
        .string "i = %d, u = %zu\n"     # printf format string
        .text
.globl main
        .type   main, @function
# main: the loop counter i lives at 24(%esp) and u at 28(%esp).
# The instruction sequence is the same as in the t2 listing.
main:
        pushl   %ebp
        movl    %esp, %ebp
        andl    $-16, %esp              # round %esp down to a 16-byte boundary
        subl    $32, %esp               # room for the locals and the outgoing arguments
        movl    $0, 24(%esp)            # i = 0
        jmp     .L2                     # enter the loop at its condition check
.L3:
        movl    28(%esp), %eax          # i = u: 4-byte load of u ...
        movl    %eax, 24(%esp)          # ... and store into i
        movl    $.LC0, %eax
        movl    28(%esp), %edx
        movl    %edx, 8(%esp)           # 3rd argument: u
        movl    24(%esp), %edx
        movl    %edx, 4(%esp)           # 2nd argument: i
        movl    %eax, (%esp)            # 1st argument: format string
        call    printf
        addl    $1, 24(%esp)            # i++
.L2:
        cmpl    $9, 24(%esp)            # continue while i <= 9
        jle     .L3
        movl    $0, %eax                # return 0
        leave
        ret
        .size   main, .-main
        .ident  "GCC: (GNU) 4.4.6 20110731 (Red Hat 4.4.6-3)"
        .section        .note.GNU-stack,"",@progbits

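Now you can compare t2 and t3 and see the difference here, though it really varies from architecture to architecture. (Note that in the 32-bit listings shown above, the instructions generated for t2 and t3 are identical: every access to u is a 4-byte movl in both.)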

相关阅读:
Top