Rocksolid Light

Welcome to novaBBS (click a section below)

mail  files  register  nodelist  faq  login

Don't compare floating point numbers solely for equality.


programming / comp.lang.asm.x86 / Problem with knucleotide benchmark

SubjectAuthor
o Problem with knucleotide benchmarkBranmimir Maksimovic

1
Subject: Problem with knucleotide benchmark
From: Branmimir Maksimovic
Newsgroups: comp.lang.asm.x86
Organization: A noiseless patient Spider
Date: Fri, 7 May 2021 00:38 UTC
Path: i2pn2.org!i2pn.org!eternal-september.org!reader02.eternal-september.org!.POSTED!not-for-mail
From: branimir...@nospicedham.gmail.com (Branmimir Maksimovic)
Newsgroups: comp.lang.asm.x86
Subject: Problem with knucleotide benchmark
Date: Fri, 7 May 2021 00:38:44 -0000 (UTC)
Organization: A noiseless patient Spider
Lines: 1123
Approved: fbkotler@myfairpoint.net - comp.lang.asm.x86 moderation team.
Message-ID: <s7226k$32j$1@dont-email.me>
Injection-Info: reader02.eternal-september.org; posting-host="427e9ac8a1a8212b552c0c364e5d6ad4";
logging-data="12681"; mail-complaints-to="abuse@eternal-september.org"; posting-account="U2FsdGVkX19NmUuGGvy3e+O7pCTUyEFs35zzg4dhork="
User-Agent: slrn/1.0.3 (Linux)
Cancel-Lock: sha1:nIvdwm3d0lTTdo+inubCVzGPifU=
View all headers
I wrote this some 15 years ago, and it is
pretty dirty.
Benchmark is described at
https://benchmarksgame-team.pages.debian.net/benchmarksgame/description/knucleotide.html#knucleotide
The program is written for fasm.
Compile and run it like this:
~/shootout/knucleotide >>> fasm knucleotidef.asm                                                                                                                             [1]
flat assembler  version 1.73.27  (16384 kilobytes memory)
4 passes, 8612 bytes.
~/shootout/knucleotide >>> gcc knucleotidef.o -o knucleotidef -no-pie -m32
~/shootout/knucleotide >>> time ./knucleotidef 32 < input.txt
A 30.295
T 30.151
C 19.800
G 19.754

AA 9.177
TA 9.132
AT 9.131
TT 9.091
CA 6.002
AC 6.001
AG 5.987
GA 5.984
CT 5.971
TC 5.971
GT 5.957
TG 5.956
CC 3.917
GC 3.911
CG 3.909
GG 3.902

1471758 GGT
446535 GGTA
47336 GGTATT
893 GGTATTTTAATT
893 GGTATTTTAATTTATAGT
../knucleotidef 32 < input.txt  28.72s user 0.22s system 331% cpu 8.730 total

The problem is that I wrote it for 4 threads, and I cannot for the love of God
figure out how to increase that number. I have posted on several forums and got no
response; it seems that the code is very complicated, since I wrote it that way
a long time ago. Today I would rewrite it much more simply, without those
macros, but what can I do now ;)

Here it is:

; vector d,s - lays out three consecutive dwords under the instance label:
;   .data     initialised from d
;   .size     initialised from s
;   .elements initialised to 0
struc vector d,s
{
  .data     dd d
  .size     dd s
  .elements dd 0
}

; ccall proc,[arg] - call a CDECL procedure.
;   proc - call target
;   arg  - zero or more dword arguments, listed left to right
; The arguments are pushed in reverse order (4 bytes each), proc is called,
; and the caller then removes the arguments from the stack.
; When no arguments are given nothing is pushed and esp is left alone:
; a macro with a group argument is still expanded once with an empty arg,
; so the push/cleanup has to be guarded explicitly.
macro ccall proc,[arg]                  ; call CDECL procedure
{
  common
    local size
    size = 0
    if ~ arg eq                         ; skip the pushes for an empty list
  reverse
    pushd arg
    size = size+4
  common
    end if
    call proc
    if size                             ; nothing to clean up without arguments
    add esp,size
    end if
}

; sys_exit rc - issue Linux int 0x80 call number 1 (exit) with status rc.
; rc is copied into ebx before eax is overwritten with the call number,
; so rc may be any register, including eax.
macro sys_exit rc
{
        mov     ebx,rc          ; exit status, read before eax is clobbered
        mov     eax,1           ; exit
        int     0x80
}

; sys_read fd, buf, size - issue Linux int 0x80 call number 3 (sys_read).
; Registers are loaded in the order eax, ebx, ecx, edx, so an argument held
; in a register that is loaded earlier is overwritten before it is read.
; The result of the call is left in eax.
macro sys_read fd, buf, size
{
        mov     eax, 3          ; sys_read
        mov     ebx, fd         ; file descriptor
        mov     ecx, buf        ; destination buffer
        mov     edx, size       ; number of bytes requested
        int     0x80
}
; sys_write fd, buf, size - issue Linux int 0x80 call number 4 (sys_write).
; Registers are loaded in the order eax, ebx, ecx, edx, so an argument held
; in a register that is loaded earlier is overwritten before it is read.
macro sys_write fd, buf, size
{
        mov     eax, 4          ; sys_write
        mov     ebx, fd         ; file descriptor
        mov     ecx, buf        ; source buffer
        mov     edx, size       ; number of bytes to write
        int     0x80
}

; Flag bits for the clone system call (values as in the Linux headers).
; sys_clone combines the first four; CLONE_THREAD is defined here but is
; not part of the mask that sys_clone builds.
CLONE_VM        equ 0x00000100
CLONE_FS        equ 0x00000200
CLONE_FILES     equ 0x00000400
CLONE_SIGHAND   equ 0x00000800
CLONE_THREAD    equ 0x00010000

; sys_clone stack - issue Linux int 0x80 call number 120 (sys_clone).
;   stack - value placed in ecx as the stack for the new task
; ebx carries CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND and edx is
; zeroed. The result of the call is left in eax.
macro sys_clone stack
{
        mov     eax,120                 ; sys_clone
        mov     ebx,CLONE_VM or CLONE_FS or CLONE_FILES or CLONE_SIGHAND
        mov     ecx,stack               ; choose stack
        xor     edx,edx                 ; no struct
        int     0x80
}

; Option bits for the wait4 system call (values as in the Linux headers).
; sys_wait passes __WALL; the other two are defined but not used by it.
__WNOTHREAD     equ 0x20000000
__WALL          equ 0x40000000
__WCLONE        equ 0x80000000

; sys_wait pid - issue Linux int 0x80 call number 114 (sys_wait4).
;   pid - copied to ebx first, so it may be passed in eax
; ecx and esi are zeroed and edx carries __WALL.
; The result of the call is left in eax.
macro sys_wait pid
{
        mov     ebx,pid                 ; read pid before eax is overwritten
        mov     eax,114                 ; sys_wait4
        mov     edx,__WALL
        xor     ecx,ecx
        xor     esi,esi
        int     0x80
}

; read fd, buf, size - copy up to size bytes into buf, going through filebuf.
;   fd   - descriptor handed to sys_read when filebuf has to be refilled
;   buf  - destination (loaded into edi)
;   size - number of bytes wanted (loaded into ebx)
; State used: fptr (current position in filebuf, 0 before the first fill),
; fend (one past the last valid byte), filebuf, fsize, fileresbuf.
; Before every refill the old filebuf contents are copied to fileresbuf
; (NOTE(review): presumably so that `back` can step back across a refill -
; confirm that fileresbuf sits directly below filebuf).
; Out: eax = bytes copied, edi = buf + eax, ebx = bytes still missing,
;      esi = new fptr when at least one byte was copied.
; A sys_read result of 0 (end of file) or a negative value (error) ends the
; read; the bytes copied so far are reported. A negative result must not be
; used as a length: it would put fend below filebuf and turn the next copy
; count negative.
macro read fd, buf,size
{
local refill,drain,done
        mov     edi,buf                 ; edi = write cursor in the caller's buffer
        mov     ebx,size                ; ebx = bytes still wanted
        xor     ecx,ecx                 ; ecx = bytes delivered so far
        mov     eax, dword [fptr]
        and     eax,eax
        jnz     drain                   ; filebuf was filled before: use it first
refill:
        push    ebx ecx edx edi         ; the copy and the syscall clobber these
        strncpy fileresbuf,filebuf,fsize,0
        sys_read fd,filebuf,fsize
        pop     edi edx ecx ebx
        test    eax,eax
        jle     done                    ; 0 = end of file, negative = error
        lea     eax, [eax+filebuf]
        mov     dword [fend], eax       ; fend = filebuf + bytes read
        mov     dword [fptr], filebuf
drain:
        mov     eax, dword [fend]
        sub     eax, dword [fptr]       ; eax = bytes available in filebuf
        jz      refill
        cmp     eax,ebx
        cmovg   eax,ebx                 ; take min(available, wanted)
        sub     ebx, eax
        add     ecx,eax
        push    ecx                     ; strncpy uses ecx as its counter
        strncpy edi,dword [fptr], eax, 0
        pop     ecx
        and     ebx,ebx                 ; flags survive the mov below
        mov     dword [fptr],esi        ; esi = source cursor after the copy
        jnz     refill                  ; still bytes missing: refill and go on
done:
        mov     eax,ecx
}

; back size - move the buffered read position fptr back by size bytes.
macro back size
{
        sub     dword [fptr],size
}

; getLine fd, buf, size, hint - read one line (up to a 0xa byte) into buf.
;   fd   - descriptor passed on to the read macro
;   buf  - destination; a terminating 0 byte is stored after the line, so
;          when size bytes arrive without a 0xa it is written at buf+size
;   size - maximum number of bytes to fetch
;   hint - chunk size requested from read on each round
; The input is fetched in chunks of at most hint bytes and each chunk is
; scanned for 0xa. Bytes fetched beyond the line end are handed back with
; `back`, so the next read starts right after the 0xa.
; Out: edi = offset of the terminating 0 from buf, edx = total bytes fetched.
; The chunk size is clamped to the space left in buf on every round; clamping
; only once would let a later round write up to hint bytes past buf+size.
macro getLine fd, buf, size, hint
{
local next,finish,done
        mov     ecx, size               ; ecx = space left in buf
        mov     ebx,hint                ; ebx = chunk size
        cmp     ecx,ebx
        cmovl   ebx,ecx
        xor     edx,edx                 ; edx = bytes fetched so far
        mov     edi, buf
next:
        cmp     ecx,0
        jle     finish                  ; buffer full
        cmp     ebx,ecx
        cmovg   ebx,ecx                 ; never request more than still fits
        push    ebx ecx edx edi
        read    fd,dword[esp],ebx       ; [esp] holds the edi pushed last
        pop     edi edx ecx ebx
        add     edx,eax
        test    eax,eax
        jz      finish;
        sub     ecx,eax
        push    ecx eax
        mov     ecx,eax
        strnchr edi,0xa,ecx             ; scan only the chunk just fetched
        pop     eax ecx
        cmp     byte [edi-1], 0xa
        jne     next                    ; no newline in this chunk
        dec     edi                     ; point at the 0xa itself
finish:
        mov     byte [edi],0            ; terminate, overwriting the 0xa if found
        sub     edi,buf
        mov     eax,edx
        dec     eax
        sub     eax,edi                 ; eax = fetched - 1 - line length
        jle     done
        back    eax                     ; hand the surplus back
done:
}

; strnchr s,c,count - scan at most count bytes starting at s for the byte c.
; Loads edi, eax and ecx in that order, clears the direction flag and runs
; repne scasb. On exit edi points one past the last byte compared, ecx holds
; the remaining count and ZF reflects the last comparison.
macro strnchr s,c,count
{
        mov     edi,s                   ; start of the scan
        mov     eax,c                   ; al is the byte compared by scasb
        mov     ecx,count               ; upper bound on bytes examined
        cld
        repne   scasb
}

; dwordnset s, c, count - store the dword c into count consecutive dwords
; starting at s. Loads edi, eax and ecx in that order and leaves the
; direction flag cleared.
macro dwordnset s, c, count
{
        mov     edi,s                   ; destination
        mov     eax,c                   ; fill value
        mov     ecx,count               ; number of dwords
        cld
        rep     stosd
}

; strnset s,c, size - store the low byte of c into size consecutive bytes
; starting at s. Loads edi, eax and ecx in that order and leaves the
; direction flag cleared.
macro strnset s,c, size
{
        mov     edi,s                   ; destination
        mov     eax,c                   ; al is the fill byte
        mov     ecx,size                ; number of bytes
        cld
        rep     stosb
}

; dwordncmp s1, s2, size, dir - compare up to size dwords of s2 (esi)
; against s1 (edi) with repe cmpsd.
;   dir - assembly-time switch: zero walks upwards, non-zero walks downwards
; The direction flag is always clear on exit; the comparison flags are kept
; because cld only touches DF.
macro dwordncmp s1, s2, size, dir
{
if dir
        std                             ; walk towards lower addresses
else
        cld
end if
        mov     esi,s2
        mov     edi,s1
        mov     ecx,size
        repe    cmpsd
if dir
        cld                             ; restore the forward direction
end if
}

; strncmp s1, s2, size, dir - compare up to size bytes of s2 (esi)
; against s1 (edi) with repe cmpsb.
;   dir - assembly-time switch: zero walks upwards, non-zero walks downwards
; The direction flag is always clear on exit; the comparison flags are kept
; because cld only touches DF.
macro strncmp s1, s2, size, dir
{
if dir
        std                             ; walk towards lower addresses
else
        cld
end if
        mov     esi,s2
        mov     edi,s1
        mov     ecx,size
        repe    cmpsb
if dir
        cld                             ; restore the forward direction
end if
}

; dwordncpy s1,s2, size, dir - copy size dwords from s2 (esi) to s1 (edi)
; with rep movsd.
;   dir - assembly-time switch: zero copies upwards, non-zero copies downwards
; The direction flag is always clear on exit.
macro dwordncpy s1,s2, size, dir
{
if dir
        std                             ; copy towards lower addresses
else
        cld
end if
        mov     esi,s2
        mov     edi,s1
        mov     ecx, size
        rep     movsd
if dir
        cld                             ; restore the forward direction
end if
}

; strncpy s1,s2, size, dir - copy size bytes from s2 (esi) to s1 (edi)
; with rep movsb. No terminator is added and the copy does not stop at a
; zero byte.
;   dir - assembly-time switch: zero copies upwards, non-zero copies downwards
; On exit esi and edi have moved past the copied bytes, ecx is 0 and the
; direction flag is clear.
macro strncpy s1,s2, size, dir
{
if dir
        std                             ; copy towards lower addresses
else
        cld
end if
        mov     esi,s2
        mov     edi,s1
        mov     ecx, size
        rep     movsb
if dir
        cld                             ; restore the forward direction
end if
}

; to_num src - translate the byte src through the table addressed by ebx.
; The caller must have ebx pointing at the table; xlatb replaces al with the
; byte at [ebx+al]. The result is left in al.
macro to_num src
{
        mov     al,src
        xlatb
}

; to_char src - translate the byte src through the table addressed by ebx.
; Same instruction sequence as to_num; which direction is translated depends
; only on the table the caller placed in ebx. The result is left in al.
macro to_char src
{
        mov     al,src
        xlatb
}

; pack_str dst,src,size,f - copy size bytes from src to dst.
;   f - assembly-time switch: when zero every byte is translated through
;       xtbl (via to_num) on the way; otherwise the bytes are copied
;       unchanged with strncpy
; Translating path: on exit esi and edi point past the processed bytes,
; ecx = 0, edx = number of bytes processed, ebx = xtbl.
; A size of 0 processes nothing; without the guard the count-down loop would
; wrap and run 2^32 times.
macro pack_str dst,src,size,f
{
if ~f
local l1,done
        mov     esi,src
        mov     edi,dst
        mov     ecx,size
        xor     edx,edx                 ; edx counts translated bytes
        mov     ebx,xtbl                ; translation table for xlatb
        test    ecx,ecx
        jz      done                    ; empty input: nothing to translate
l1:
        to_num  byte [esi]
        mov     byte [edi], al
        inc     edi
        inc     esi
        inc     edx
        dec     ecx
        jnz     l1
done:
else
        strncpy dst,src,size,0
end if
}

; really_pack_str dst,src,size,f - pack size source bytes into dst, four per
; destination byte. Each source byte is translated through xtbl (to_num) and
; OR-ed into the current destination byte after that byte has been shifted
; left by 2 bits.
; The parameter f is not referenced in the body.
; On exit edx = number of destination bytes written (it starts at 1 and is
; incremented each time edi advances), edi points at the last byte written.
; NOTE(review): when size is not a multiple of 4 the last destination byte
; holds fewer than four codes in its low bits; size = 0 is not handled (the
; dec ecx / jz test would wrap) - confirm callers never pass 0.
macro really_pack_str dst,src,size,f
{
local l1,l2,e1
mov esi,src
mov edi,dst
mov ecx,size
mov edx,1
; l1: start a new destination byte; ebx counts the four slots in it
l1:
mov ebx,4
mov byte [edi],0
; l2: add one translated source byte to the current destination byte
l2:
; ebx doubles as the slot counter, so it is saved around the table lookup
push ebx
mov ebx,xtbl
to_num byte [esi]
pop ebx
shl byte [edi],2
or byte [edi], al
inc esi
dec ecx
jz e1
dec ebx
jnz l2
inc edi
inc edx ; count
jmp l1
e1:
}

; unpack_str dst,src,size - copy size bytes from src to dst, translating
; every byte through xtbl (via to_char).
; On exit esi and edi point past the processed bytes, ecx = 0, ebx = xtbl.
; A size of 0 processes nothing; without the guard the count-down loop would
; wrap and run 2^32 times.
macro unpack_str dst,src,size
{
local l1,done
        mov     esi,src
        mov     edi,dst
        mov     ecx,size
        mov     ebx,xtbl                ; translation table for xlatb
        test    ecx,ecx
        jz      done                    ; empty input: nothing to translate
l1:
        to_char byte [esi]
        mov     byte [edi], al
        inc     edi
        inc     esi
        dec     ecx
        jnz     l1
done:
}

; initvector data,oldsize,size,block - (re)allocate the buffer whose pointer
; is stored at [data] to size*block bytes using realloc.
;   data    - address of the dword holding the buffer pointer
;   oldsize - address of the dword that receives the new byte count
;   size    - element count
;   block   - bytes per element
; On success [data] = new pointer and [oldsize] = size*block.
; When realloc returns 0 the macro calls perror with err1 and exits with
; status -1.
macro initvector data,oldsize,size,block
{
local ok
        mov     eax, size
        imul    eax, block              ; eax = byte count
        push    eax                     ; keep the byte count across the call
        ccall   realloc,dword[data],eax
        pop     ebx                     ; ebx = byte count that was requested
        test    eax,eax
        jnz     ok
        ccall   perror, err1            ; allocation failed
        sys_exit -1
ok:
        mov     dword[data],eax
        mov     dword[oldsize],ebx
}

; freevector data,size - free every pointer in an array of dword pointers.
;   data - address of the first pointer
;   size - number of pointers
; Each entry is passed to free and then set to 0.
; On exit ebx points past the last entry and ecx = 0.
; A size of 0 frees nothing; without the guard the count-down loop would wrap
; and call free on 2^32 words of unrelated memory.
macro freevector data,size
{
local l1,done
        mov     ecx,size
        mov     ebx,data
        test    ecx,ecx
        jz      done                    ; empty array: nothing to free
l1:
        push    ebx ecx                 ; keep cursor and counter across the call
        ccall   free,dword[ebx]
        pop     ecx ebx
        mov     dword[ebx],0            ; clear the freed slot
        add     ebx,4
        dec     ecx
        jnz     l1
done:
}

; hash str,size - fold the bytes at str into xmm1, two bits per byte.
;   str  - address of the input bytes (loaded into ebx)
;   size - number of bytes (loaded into ecx)
; Each byte is OR-ed into eax after eax has been shifted left by 2, so the
; input is presumably already reduced to 2-bit codes - TODO confirm.
; Sixteen bytes fill one dword; every completed dword is pushed into xmm1 by
; shifting xmm1 left by 4 bytes and OR-ing the dword into the low lane.
; For size > 4 the value of xmm1 on entry is inspected and, when it is not
; all zero, partly reused (labels before l1).
; NOTE(review): the reuse paths look like an incremental update from the
; previous call's result - confirm against the callers.
; In the packing loop edi starts at 4 and is decremented once per dword
; pushed; strfind reads edi afterwards.
; Clobbers eax, ebx, ecx, edx, esi, edi, xmm1, xmm2 and flags.
macro hash str,size
{
local l1,s1,s2,e1,e2
mov ecx,size
mov ebx,str
mov edi,4
mov esi,16
xor eax,eax

; sizes up to 4 always start from a cleared xmm1 (s1)
cmp ecx,4
jle s1
; xmm1 all zero on entry: pack from scratch at l1
; (pcmpeqb against zero gives mask 0xffff only when every byte of xmm1 is 0)
pxor xmm2,xmm2
pcmpeqb xmm2,xmm1
pmovmskb edx,xmm2
xor dx,0xffff
je l1
; sizes above 16 are handled at s2
cmp ecx,16
jg s2
; sizes 5..16 with a non-zero xmm1: point ebx at the last four input bytes,
; keep only the low 2*(size-4) bits of each qword lane of xmm1 (shift left
; then right by 64-2*(size-4)), take the low dword as the starting eax and
; let the loop at l1 append the last four bytes
sub ecx,4
add ebx,ecx
shl ecx,1
sub ecx,64
neg ecx
movd xmm2,ecx
psllq xmm1,xmm2
psrlq xmm1,xmm2
movd eax,xmm1
mov ecx,4
; s1: clear xmm1, then pack ecx bytes starting at ebx on top of eax
s1:
pxor xmm1,xmm1
jmp l1
; s2: sizes above 16 with a non-zero xmm1
; edx = dword 0 of xmm1, eax = dword 1 of xmm1
s2:
movd edx,xmm1
psrldq xmm1,4
movd eax,xmm1
; sizes of 20 and more continue at s1 with ecx and ebx unchanged
cmp ecx,20
jge s1
; sizes 17..19: combine bits of edx and eax with shifts derived from size,
; then append the last four input bytes through s1/l1
; NOTE(review): the shift arithmetic is not re-derived here - verify before
; changing anything in this path
sub ecx,4
add ebx,ecx
sub ecx,12
mov edi,4
sub edi,ecx
mov esi,edi
shl ecx,1
mov edi,ecx
sub ecx,32
neg ecx
shl edx,cl
shld eax,edx,8
sub edi,8
neg edi
mov ecx,edi
shr eax,cl
mov ecx,4
mov edi,4
jmp s1
; l1: append one input byte: eax = (eax << 2) | byte [ebx]
; ecx = bytes left, esi = slots left in eax, edi = dwords left in xmm1
l1:
shl eax,2
movzx edx,byte[ebx]
or eax,edx
dec ecx
jle e1
inc ebx
dec esi
jnz l1
; eax is full: shift xmm1 left by one dword and put eax in the low dword
pslldq xmm1,4
movdqa xmm2,xmm1
movd xmm1,eax
por xmm1,xmm2
xor eax,eax
mov esi,16
dec edi
; stop once four dwords have been pushed, even if input bytes remain
jz e2
jmp l1
; e1: input exhausted - push the last (possibly partly filled) eax the same way
e1:
pslldq xmm1,4
movdqa xmm2,xmm1
movd xmm1,eax
por xmm1,xmm2
dec edi
e2:
}

; hashfind data,elements,block,srchstr,srchlen - hash a key and hand over to
; strfind.
;   data     - value loaded into ebx for strfind
;   elements - forwarded to strfind
;   block    - forwarded to strfind
;   srchstr  - address of the key; kept in xmm0 and passed to hash
;   srchlen  - key length passed to hash
; Register use follows the hash and strfind macros.
macro hashfind data,elements,block,srchstr,srchlen
{
        mov     eax,srchstr
        movd    xmm0,eax                ; xmm0 = srchstr, read later by strfind
        hash    srchstr,srchlen         ; leaves the packed key in xmm1
        mov     ebx,data
        strfind elements,block
}

macro strfind elements,block
{
local l1,l2,l3,l4,l5,s1,s2,s3,e1
movdqa xmm2,xmm1
mov edx,4
sub edx,edi
xor eax,eax
s2:
movd ecx,xmm2
shld eax,ecx,16
and ecx,0xffff
add eax,ecx
psrldq xmm2,4
dec edx
jnz s2
s3:
and eax,0xffff
xor esi,esi
shl eax,2
movd xmm2,eax
movd xmm3,ebx
cmp dword[ebx+eax],0
jne l3
l1:
; allocate
s1:
mov ebx,1
xor eax,eax
lock cmpxchg dword[sema],ebx ; test and set
and eax,eax
jnz s1
add esi,28
movd eax,xmm2
movd ebx,xmm3
ccall realloc,dword[ebx+eax],esi ; realloc is not thread safe
lock and dword[sema],0 ; reset
mov esi,eax
and esi,esi
jz e2
movd eax,xmm2
movd ebx,xmm3
cmp dword[ebx+eax],0
mov dword[ebx+eax],esi
jne l2
mov esi, dword[ebx+eax]
mov dword[esi],0
l2:
mov ebx,dword[ebx+eax]
add ebx,4
mov eax,dword[ebx-4]
imul eax,24
mov dword[ebx+eax],0
movd [ebx+eax+4],xmm0
movdqu [ebx+eax+8],xmm1
inc dword[elements]
inc dword[ebx-4]
jmp e1
;search
l3:
mov ebx,dword[ebx+eax]
add ebx,4
xor eax,eax

l4:
mov esi,dword[ebx-4]
imul esi,24
cmp eax,esi
jge l1 ; we need to reallocate

Click here to read the complete article
1
rocksolid light 0.7.2
clearneti2ptor