A friend asked me this programming multiple-choice question that's been driving me nuts. Not because it was a particularly hard question, but because it was just full of holes.
Given this code, what is the run-time complexity:
sum = 0
for (i = 0; i < n^2; i++)
    for (j = 0; j < n; j++)
        for (k = 0; k < j; k++)
            sum++;
It didn't take me long to figure out that the "correct" answer is that this code has a time complexity of O(n^4). Simple CS 101.
Reasoning:
The first loop runs n^2 times. Check.
The second and third loops run like this:
when j = 0: k = 0, but k < j doesn't hold -> 0 runs.
when j = 1: k = 0 -> 1 run.
when j = 2: k = 0, 1 -> 2 runs.
when j = 3: k = 0, 1, 2 -> 3 runs.
This turns out to be a summation: 1 + 2 + 3 + ... + (n-1), which is (n-1)*n/2.
Combining this with the first loop, the run time is O(n^2 * (n-1)*n/2).
All in all, the time complexity is O(n^4).
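As a quick sanity check (my own addition, not part of the original question), you can brute-force the loops and compare the count against the closed form n^2 * n*(n-1)/2 for a few small values of n:

#include <stdio.h>

/* Brute-force the triple loop and compare it against the closed form
   n^2 * (n*(n-1)/2) for small n. */
int main(void)
{
    for (long n = 1; n <= 20; n++) {
        long brute = 0;
        for (long i = 0; i < n * n; i++)
            for (long j = 0; j < n; j++)
                for (long k = 0; k < j; k++)
                    brute++;
        long formula = n * n * (n * (n - 1) / 2);
        printf("n=%2ld brute=%8ld formula=%8ld %s\n",
               n, brute, formula, brute == formula ? "ok" : "MISMATCH");
    }
    return 0;
}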
But this could be wrong under certain assumptions.
Breaking Assumption 1: Suppose this is the second time running this code. Or the third time. If the machine running this program has cached the result from a previous run, then it's an O(1) operation. Constant-time lookup. Woohoo! Done.
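Here's a toy sketch of that idea (entirely my own, with made-up slow_sum/cached_sum helpers): once the answer for a given n has been computed and stored, every later query for the same n is a constant-time table lookup.

#include <stdio.h>

#define MAX_N 1024

static long cache[MAX_N];            /* 0 doubles as "not computed yet" */

/* The original triple loop, unchanged. */
static long slow_sum(long n)
{
    long sum = 0;
    for (long i = 0; i < n * n; i++)
        for (long j = 0; j < n; j++)
            for (long k = 0; k < j; k++)
                sum++;
    return sum;
}

/* First call for a given n pays the full price; repeats are O(1). */
static long cached_sum(long n)
{
    if (n < 0 || n >= MAX_N)         /* outside the table: just compute */
        return slow_sum(n);
    if (cache[n] == 0)               /* n < 2 really is 0, so recomputing it is harmless */
        cache[n] = slow_sum(n);
    return cache[n];
}

int main(void)
{
    printf("%ld\n", cached_sum(50)); /* slow the first time */
    printf("%ld\n", cached_sum(50)); /* constant-time lookup now */
    return 0;
}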
Breaking Assumption 2: I heard a story that clang and other compilers are super-smart. They don't actually have to run these loops; they can figure out that the code above collapses into a summation formula, so the answer is always n^3*(n-1)/2. The computer doesn't need to run any loops! That's maybe a load or two, a few multiplications, a subtraction, and a division. Probably fewer than 20 instructions total, which is a constant number of CPU instructions regardless of n. So it's still a constant-time operation, independent of the size of n.
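If a compiler really pulled that off, the end result would behave like this hand-written version. This is just my sketch of what the optimizer could reduce the program to, not what any particular compiler actually emits:

#include <stdio.h>
#include <stdlib.h>

/* Sketch of the "sufficiently smart compiler" result: compute
   sum = n^2 * (n*(n-1)/2) = n^3*(n-1)/2 directly, with no loops,
   so the work is constant regardless of n. */
int main(int argc, char* argv[])
{
    long n = atol(argv[1]);
    long sum = (n > 1) ? n * n * (n * (n - 1) / 2) : 0;  /* the loops produce 0 for n < 2 */
    printf("%ld\n", sum);
    return 0;
}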
So yes, it's O(n^4) under certain conditions. Ugh. And since Big O notation gives an asymptotic upper bound for the worst case, I guess that's the safe, "right" answer. But under other conditions, there's no way the asymptotic behavior would ever reach O(n^4). It really depends on your environment and other details.
I was super curious about what optimizations the compiler actually makes, so I coded this up in C on my computer and generated some assembly.
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char* argv[])
{
    int n = atoi(argv[1]);
    int i, j, k = 0;
    int sum = 0;
    for (i = 0; i < n*n; i++) {
        for (j = 0; j < n; j++) {
            for (k = 0; k < j; k++) {
                sum++;
            }
        }
    }
    printf("%d", sum);
    return 0;
}
Then I converted this to assembly with clang and gcc (something like gcc -S dumbq.c and clang -S dumbq.c, plus the same with -O3) and started digging around:
gcc, no optimizations:
.file "dumbq.c"
.section .rodata
.LC0:
.string "%d"
.text
.globl main
.type main, @function
main:
.LFB2:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $48, %rsp
movl %edi, -36(%rbp)
movq %rsi, -48(%rbp)
movq -48(%rbp), %rax
addq $8, %rax
movq (%rax), %rax
movq %rax, %rdi
call atoi
movl %eax, -4(%rbp)
movl $0, -12(%rbp)
movl $0, -8(%rbp)
movl $0, -20(%rbp)
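# (note: after atoi, n lives at -4(%rbp); sum is at -8(%rbp), k at -12(%rbp),
#  j at -16(%rbp), i at -20(%rbp))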
jmp .L2
.L7:
movl $0, -16(%rbp)
jmp .L3
.L6:
movl $0, -12(%rbp)
jmp .L4
.L5:
addl $1, -8(%rbp)
addl $1, -12(%rbp)
.L4:
movl -12(%rbp), %eax
cmpl -16(%rbp), %eax
jl .L5
addl $1, -16(%rbp)
.L3:
movl -16(%rbp), %eax
cmpl -4(%rbp), %eax
jl .L6
addl $1, -20(%rbp)
.L2:
movl -4(%rbp), %eax
imull -4(%rbp), %eax
cmpl -20(%rbp), %eax
jg .L7
movl -8(%rbp), %eax
movl %eax, %esi
movl $.LC0, %edi
movl $0, %eax
call printf
movl $0, %eax
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE2:
.size main, .-main
.ident "GCC: (Ubuntu 5.4.0-6ubuntu1~16.04.11) 5.4.0 20160609"
.section .note.GNU-stack,"",@progbits
clang, no optimizations:
.text
.intel_syntax noprefix
.file "dumbq.c"
.globl main
.align 16, 0x90
.type main,@function
main: # @main
.cfi_startproc
# BB#0:
push rbp
.Ltmp0:
.cfi_def_cfa_offset 16
.Ltmp1:
.cfi_offset rbp, -16
mov rbp, rsp
.Ltmp2:
.cfi_def_cfa_register rbp
sub rsp, 48
mov dword ptr [rbp - 4], 0
mov dword ptr [rbp - 8], edi
mov qword ptr [rbp - 16], rsi
mov rsi, qword ptr [rbp - 16]
mov rdi, qword ptr [rsi + 8]
call atoi
mov dword ptr [rbp - 20], eax
mov dword ptr [rbp - 32], 0
mov dword ptr [rbp - 36], 0
mov dword ptr [rbp - 24], 0
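# (note: n is [rbp - 20], i is [rbp - 24], j is [rbp - 28],
#  k is [rbp - 32], sum is [rbp - 36])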
.LBB0_1: # =>This Loop Header: Depth=1
# Child Loop BB0_3 Depth 2
# Child Loop BB0_5 Depth 3
mov eax, dword ptr [rbp - 24]
mov ecx, dword ptr [rbp - 20]
imul ecx, dword ptr [rbp - 20]
cmp eax, ecx
jge .LBB0_12
# BB#2: # in Loop: Header=BB0_1 Depth=1
mov dword ptr [rbp - 28], 0
.LBB0_3: # Parent Loop BB0_1 Depth=1
# => This Loop Header: Depth=2
# Child Loop BB0_5 Depth 3
mov eax, dword ptr [rbp - 28]
cmp eax, dword ptr [rbp - 20]
jge .LBB0_10
# BB#4: # in Loop: Header=BB0_3 Depth=2
mov dword ptr [rbp - 32], 0
.LBB0_5: # Parent Loop BB0_1 Depth=1
# Parent Loop BB0_3 Depth=2
# => This Inner Loop Header: Depth=3
mov eax, dword ptr [rbp - 32]
cmp eax, dword ptr [rbp - 28]
jge .LBB0_8
# BB#6: # in Loop: Header=BB0_5 Depth=3
mov eax, dword ptr [rbp - 36]
add eax, 1
mov dword ptr [rbp - 36], eax
# BB#7: # in Loop: Header=BB0_5 Depth=3
mov eax, dword ptr [rbp - 32]
add eax, 1
mov dword ptr [rbp - 32], eax
jmp .LBB0_5
.LBB0_8: # in Loop: Header=BB0_3 Depth=2
jmp .LBB0_9
.LBB0_9: # in Loop: Header=BB0_3 Depth=2
mov eax, dword ptr [rbp - 28]
add eax, 1
mov dword ptr [rbp - 28], eax
jmp .LBB0_3
.LBB0_10: # in Loop: Header=BB0_1 Depth=1
jmp .LBB0_11
.LBB0_11: # in Loop: Header=BB0_1 Depth=1
mov eax, dword ptr [rbp - 24]
add eax, 1
mov dword ptr [rbp - 24], eax
jmp .LBB0_1
.LBB0_12:
movabs rdi, .L.str
mov esi, dword ptr [rbp - 36]
mov al, 0
call printf
xor esi, esi
mov dword ptr [rbp - 40], eax # 4-byte Spill
mov eax, esi
add rsp, 48
pop rbp
ret
.Lfunc_end0:
.size main, .Lfunc_end0-main
.cfi_endproc
.type .L.str,@object # @.str
.section .rodata.str1.1,"aMS",@progbits,1
.L.str:
.asciz "%d"
.size .L.str, 3
.ident "clang version 3.8.0-2ubuntu4 (tags/RELEASE_380/final)"
.section ".note.GNU-stack","",@progbits
gcc, -O3 optimization:
.file "dumbq.c"
.section .rodata.str1.1,"aMS",@progbits,1
.LC0:
.string "%d"
.section .text.unlikely,"ax",@progbits
.LCOLDB1:
.section .text.startup,"ax",@progbits
.LHOTB1:
.p2align 4,,15
.globl main
.type main, @function
main:
.LFB38:
.cfi_startproc
subq $8, %rsp
.cfi_def_cfa_offset 16
movq 8(%rsi), %rdi
movl $10, %edx
xorl %esi, %esi
call strtol
movl %eax, %edi
xorl %edx, %edx
xorl %r8d, %r8d
imull %eax, %eax
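# (note: %edi = n, %eax = n*n, %r8d = i, %edx = sum, %esi/%ecx walk j below;
#  the k loop is gone: the leal 1(%rdx,%rsi),%edx in .L16 adds j to sum in
#  a single instruction, so only the two outer loops remain)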
testl %eax, %eax
je .L3
.p2align 4,,10
.p2align 3
.L10:
xorl %esi, %esi
testl %edi, %edi
jle .L4
leal 1(%rsi), %ecx
cmpl %edi, %ecx
je .L4
.p2align 4,,10
.p2align 3
.L16:
testl %ecx, %ecx
jle .L6
leal 1(%rdx,%rsi), %edx
.L6:
movl %ecx, %esi
leal 1(%rsi), %ecx
cmpl %edi, %ecx
jne .L16
.L4:
addl $1, %r8d
cmpl %r8d, %eax
jne .L10
.L3:
movl $.LC0, %esi
movl $1, %edi
xorl %eax, %eax
call __printf_chk
xorl %eax, %eax
addq $8, %rsp
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE38:
.size main, .-main
.section .text.unlikely
.LCOLDE1:
.section .text.startup
.LHOTE1:
.ident "GCC: (Ubuntu 5.4.0-6ubuntu1~16.04.11) 5.4.0 20160609"
.section .note.GNU-stack,"",@progbits
clang, -O3 optimization:
.text
.intel_syntax noprefix
.file "dumbq.c"
.globl main
.align 16, 0x90
.type main,@function
main: # @main
.cfi_startproc
# BB#0:
push rbx
.Ltmp0:
.cfi_def_cfa_offset 16
.Ltmp1:
.cfi_offset rbx, -16
mov rdi, qword ptr [rsi + 8]
xor ebx, ebx
xor esi, esi
mov edx, 10
call strtol
mov ecx, eax
imul ecx, ecx
test ecx, ecx
je .LBB0_3
# BB#1:
test eax, eax
jle .LBB0_3
# BB#2: # %.preheader.lr.ph.us.preheader
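# (note: no loop from here on; the lea/imul sequence below computes
#  n^2 * n*(n-1)/2 directly from n in a constant number of instructions)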
lea edx, [rax - 1]
lea esi, [rax - 2]
imul rsi, rdx
shr rsi
lea edx, [rax + rsi - 1]
test ecx, ecx
mov edi, 1
cmovg edi, ecx
dec edi
imul edi, edx
add edi, eax
lea ebx, [rsi + rdi - 1]
.LBB0_3: # %._crit_edge10
mov edi, .L.str
xor eax, eax
mov esi, ebx
call printf
xor eax, eax
pop rbx
ret
.Lfunc_end0:
.size main, .Lfunc_end0-main
.cfi_endproc
.type .L.str,@object # @.str
.section .rodata.str1.1,"aMS",@progbits,1
.L.str:
.asciz "%d"
.size .L.str, 3
.ident "clang version 3.8.0-2ubuntu4 (tags/RELEASE_380/final)"
.section ".note.GNU-stack","",@progbits
Results:
Without optimizations, both compilers emit assembly that mirrors the source, loop for loop. With optimizations it's a different story. gcc's -O3 output still has the two outer loops (.L10 over i, .L16 over j), but the innermost k loop is gone: the leal 1(%rdx,%rsi), %edx line adds j to sum in one shot, so the optimized code is roughly O(n^3) instead of O(n^4). clang's -O3 output is wilder: there is no loop left at all. The handful of lea/imul instructions after the two early-exit checks compute the answer directly from n, which, as far as I can tell, is the summation formula in disguise. So at least with clang, the number of instructions executed really doesn't depend on n, just like I suspected: this code boils down to a couple of instructions. I'm still rusty at reading assembly, though, so take what follows with a grain of salt.
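For what it's worth, here's my reading of clang's loop-free block translated back into C. The variable names are mine and I might be off on a detail, but the arithmetic checks out against n^2 * n*(n-1)/2; it's the summation formula, just factored a little oddly.

#include <stdio.h>
#include <stdlib.h>

/* My reading of clang's -O3 output, back in C (registers in comments).
   It computes n^2 * (n*(n-1)/2) without running a single loop. */
int main(int argc, char* argv[])
{
    long n = atol(argv[1]);
    long sum = 0;                           /* rbx starts at 0 */
    if (n > 0) {                            /* clang also bails out early if n*n == 0 */
        long t   = (n - 1) * (n - 2) / 2;   /* rsi: (n-1)(n-2)/2 */
        long per = (n - 1) + t;             /* rdx: equals n*(n-1)/2, one outer trip's worth */
        long acc = (n * n - 1) * per;       /* rdi: the first n^2 - 1 outer trips */
        sum = acc + n + t - 1;              /* rbx: (n - 1) + t tacks on one more n*(n-1)/2 */
    }
    printf("%ld\n", sum);
    return 0;
}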