Rocksolid Light

Welcome to novaBBS (click a section below)

mail  files  register  nodelist  faq  login

We are experiencing system trouble -- do not adjust your terminal.


programming / comp.lang.asm.x86 / RTM-question

Subject | Author
* RTM-question | Bonita Montero
`- Re: RTM-question | Melzzzzz

1
Subject: RTM-question
From: Bonita Montero
Newsgroups: comp.lang.asm.x86
Organization: albasani.net
Date: Fri, 27 Sep 2019 20:01 UTC
Path: i2pn2.org!i2pn.org!eternal-september.org!feeder.eternal-september.org!reader01.eternal-september.org!.POSTED!not-for-mail
From: Bonita.M...@nospicedham.gmail.com (Bonita Montero)
Newsgroups: comp.lang.asm.x86
Subject: RTM-question
Date: Fri, 27 Sep 2019 22:01:33 +0200
Organization: albasani.net
Lines: 200
Approved: fbkotler@myfairpoint.net - comp.lang.asm.x86 moderation team.
Message-ID: <qmlpqt$l9j$1@news.albasani.net>
Mime-Version: 1.0
Content-Type: text/plain; charset=utf-8; format=flowed
Content-Transfer-Encoding: 7bit
Injection-Info: reader02.eternal-september.org; posting-host="1b77af106a740515d4a7e63bf7e304ae";
logging-data="2468"; mail-complaints-to="abuse@eternal-september.org"; posting-account="U2FsdGVkX1/rqTPUMKN1U8vlWbIYRQnGZaH+LpiLOaQ="
User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64; rv:68.0) Gecko/20100101
Thunderbird/68.1.1
Cancel-Lock: sha1:suIV4pnXwIoYvyYdGeJN5L1imeA=
View all headers
This isn't really an asm question, but it is a question about TSX/RTM,
so it is x86-architecture-related. I want to test whether TSX/RTM can
be faster than LOCK XADD or LOCK CMPXCHG for an atomic operation on a
size_t-sized operand.

So here's the test-code:

#if defined(_MSC_VER)
     #include <Windows.h>
     #include <intrin.h>
#elif defined(__unix__)
     #include <sys/sysinfo.h>
     #include <sched.h>
     #include <pthread.h>
     #include <immintrin.h>
#endif
#if !defined(_MSC_VER) && (defined(__GNUC__) || defined(__clang__)) && (defined(__x86_64__) || defined(__i386__))
     #include <cpuid.h>
#endif
#include <array>
#include <atomic>
#include <chrono>
#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <functional>
#include <iostream>
#include <thread>
#include <vector>

bool hasTSX();

using namespace std;
using namespace chrono;

// Atomically adds `a` to `v` and hands back the value `v` held before the
// addition. Dispatches to the MSVC interlocked intrinsic matching the
// pointer width, or to the GCC/Clang __sync builtin.
inline
size_t fetchAdd( size_t volatile &v, size_t a )
{
     size_t volatile *const target = &v;
#if defined(_MSC_VER)
     #if defined(_M_X64)
     __int64 const previous = _InterlockedExchangeAdd64( reinterpret_cast<__int64 volatile *>( target ), static_cast<__int64>( a ) );
     return static_cast<size_t>( previous );
     #elif defined(_M_IX86)
     long const previous = _InterlockedExchangeAdd( reinterpret_cast<long volatile *>( target ), static_cast<long>( a ) );
     return static_cast<size_t>( previous );
     #else
         #error unsupported architecture
     #endif
#elif defined(__GNUC__) || defined(__clang__)
     size_t const previous = __sync_fetch_and_add( target, a );
     return previous;
#else
         #error unsupported architecture
#endif
}

// Makes a single attempt to add `a` to `v` inside an RTM transaction.
// Returns true when the transaction started and ran through _xend();
// returns false when _xbegin() yields anything other than _XBEGIN_STARTED.
// No retry happens here - that is left to the caller.
inline
bool rtmFetchAdd( size_t volatile &v, size_t a )
{
     unsigned const status = _xbegin();
     if( status != _XBEGIN_STARTED )
         return false;

     v += a;
     _xend();
     return true;
}

// Atomic compare-and-swap: when `v` equals the comparand `c`, `x` is stored
// into `v`. In every case the value observed in `v` before the operation is
// returned, so the swap succeeded exactly when the result equals `c`.
inline
size_t compareExchange( size_t volatile &v, size_t c, size_t x )
{
     size_t volatile *const target = &v;
#if defined(_MSC_VER)
     #if defined(_M_X64)
     __int64 const observed = _InterlockedCompareExchange64( reinterpret_cast<__int64 volatile *>( target ), static_cast<__int64>( x ), static_cast<__int64>( c ) );
     return static_cast<size_t>( observed );
     #elif defined(_M_IX86)
     long const observed = _InterlockedCompareExchange( reinterpret_cast<long volatile *>( target ), static_cast<long>( x ), static_cast<long>( c ) );
     return static_cast<size_t>( observed );
     #else
         #error unsupported architecture
     #endif
#elif defined(__GNUC__) || defined(__clang__)
     size_t const observed = __sync_val_compare_and_swap( target, c, x );
     return observed;
#else
         #error unsupported architecture
#endif
}

int main( int argc, char **argv )
{
     if( argc < 2 )
         return -1;
     double nsPerClockCycle = 1.0 / (atof( argv[1] ) * 1.0e9);

     auto thrXadd = []( uint8_t volatile &run, size_t adds, size_t volatile &atm, atomic<size_t> &misses )
     {
         while( !run );
         for( size_t i = adds; i; --i )
             fetchAdd( atm, 1 );
     };
     auto thrXchg = []( uint8_t volatile &run, size_t adds, size_t volatile &atm, atomic<size_t> &misses )
     {
         while( !run );
         size_t missed = 0;
         for( size_t i = adds, cmp = atm; i; --i )
         {
             for( size_t res; ; )
                 if( (res = compareExchange( atm, cmp, cmp + 1 )) == cmp )
                 {
                     cmp = cmp + 1;
                     break;
                 }
                 else
                     cmp = res,
                     ++missed;
         }
         misses.fetch_add( missed );
     };
     auto rtmAdd = []( uint8_t volatile &run, size_t adds, size_t volatile &atm, atomic<size_t> &misses )
     {
         while( !run );
         size_t missed = 0;
         for( size_t i = adds; i; --i )
             while( !rtmFetchAdd( atm, 1 ) )
                 ++missed;
         misses.fetch_add( missed );
     };
     using threadfunc = void (*)( uint8_t volatile &, size_t, size_t volatile &, atomic<size_t> & );
     array<threadfunc, 3>   atf;
     array<char const *, 3> threadDescr;
     size_t                 nTests;
     size_t const           ADDS = 10'000'000;
     unsigned               nProcessors = thread::hardware_concurrency();

     atf[0]         = thrXadd;
     atf[1]         = thrXchg;
     atf[2]         = rtmAdd;
     threadDescr[0] = "xadd-thread";
     threadDescr[1] = "cmpxchge-thread";
     threadDescr[2] = "rtm-thread";
     nTests         = hasTSX() ? atf.size() : atf.size() - 1;

     for( size_t m = 0; m != nTests; ++m )
     {
         cout << threadDescr[m] << ":" << endl;
         for( unsigned nThreads = 1; nThreads <= nProcessors; ++nThreads )
         {
             atomic<size_t> misses( 0 );
             uint8_t        run = false;
             size_t         atm;
             vector<thread> threads;
             for( unsigned i = 0; i != nThreads; ++i )
             {
                 threads.emplace_back( atf[m], ref( run ), ADDS, ref( atm ), ref( misses ) );
#if defined(_MSC_VER)
                 SetThreadAffinityMask( threads[i].native_handle(), (DWORD_PTR)1 << i );
#elif defined(__unix__)
                 cpu_set_t cpuset;
                 CPU_ZERO(&cpuset);
                 CPU_SET(i, &cpuset);
                 pthread_setaffinity_np( threads[i].native_handle(), sizeof cpuset, &cpuset );
#endif
             }
             time_point<high_resolution_clock> start = high_resolution_clock::now();
             run = true;
             for( unsigned i = 0; i != nThreads; ++i )
                 threads[i].join();
             uint64_t ns = (uint64_t)duration_cast<nanoseconds>( high_resolution_clock::now() - start ).count();;

             double nsPerAdd = (double)ns / nThreads / ADDS / 1.0e9;
             cout << "threads: " << nThreads << " cycles: " << nsPerAdd / nsPerClockCycle << " misses-ratio: " << (int)(100.0 * (size_t)misses / nThreads / ADDS) << "%" << endl;
         }
         cout << endl;
     }
}

// Reports whether the CPU advertises RTM (the XBEGIN/XEND part of TSX).
//
// RTM is signalled by CPUID.(EAX=7,ECX=0):EBX bit 11. Leaf 7 is only queried
// after confirming that the CPU implements it. Builds for which no CPUID
// query is available (non-x86 targets, unknown compilers) report false, so
// the RTM benchmark is skipped instead of faulting on an unsupported
// instruction.
bool hasTSX()
{
#if defined(_MSC_VER)
     int regs[4];
     __cpuid( regs, 0 );              // regs[0] = highest supported basic leaf
     if( regs[0] < 7 )
         return false;
     __cpuidex( regs, 7, 0 );
     return (regs[1] & (1 << 11)) != 0;
#elif (defined(__GNUC__) || defined(__clang__)) && (defined(__x86_64__) || defined(__i386__))
     if( __get_cpuid_max( 0, nullptr ) < 7 )
         return false;
     unsigned eax = 0;
     unsigned ebx = 0;
     unsigned ecx = 0;
     unsigned edx = 0;
     __cpuid_count( 7, 0, eax, ebx, ecx, edx );
     return (ebx & (1u << 11)) != 0;
#else
     return false;
#endif
}

So can anyone here compile this with MSVC++ or gcc / clang on a Skylake
or newer CPU with TSX and give me the output? With gcc / clang you need
the compiler-option "-mrtm" to enable RTM.
When running the program you need to pass the base clock of the CPU in
GHz as the first argument. The program prints an estimate (only an
estimate, because the real clock may vary due to boosting) of the
clock cycles spent on each successful increment.



Subject: Re: RTM-question
From: Melzzzzz
Newsgroups: comp.lang.asm.x86
Organization: usenet-news.net
Date: Tue, 15 Oct 2019 05:07 UTC
References: 1
Path: i2pn2.org!i2pn.org!eternal-september.org!feeder.eternal-september.org!reader01.eternal-september.org!.POSTED!not-for-mail
From: Melzz...@nospicedham.zzzzz.com (Melzzzzz)
Newsgroups: comp.lang.asm.x86
Subject: Re: RTM-question
Date: Tue, 15 Oct 2019 05:07:03 GMT
Organization: usenet-news.net
Lines: 43
Approved: fbkotler@myfairpoint.net - comp.lang.asm.x86 moderation team.
Message-ID: <XtcpF.895857$t92.416664@fx25.am4>
References: <qmlpqt$l9j$1@news.albasani.net>
Mime-Version: 1.0
Content-Type: text/plain; charset=utf-8
Content-Transfer-Encoding: 8bit
Injection-Info: reader02.eternal-september.org; posting-host="2aa05ce9203ff75ff2869c33a5072e0a";
logging-data="20512"; mail-complaints-to="abuse@eternal-september.org"; posting-account="U2FsdGVkX1/1uTBQD8tg3pvJmwdzUtb9ysvmggvNJEo="
User-Agent: slrn/1.0.3 (Linux)
Cancel-Lock: sha1:1AllUb2LwBNJrB/MSPMRthSRQnA=
View all headers
On 2019-09-27, Bonita Montero <Bonita.Montero@nospicedham.gmail.com> wrote:

// Quoted verbatim from the post being replied to.
// Returns whether the RTM benchmark should run.
bool hasTSX()
{
#if defined(_MSC_VER)
     int regs[4];
     // Query CPUID leaf 7, sub-leaf 0; the four result registers land in regs[0..3].
     __cpuidex( regs, 7, 0 );
     // The answer is bit 11 of the second result register (regs[1]).
     return regs[1] & (1 << 11);
#else
     // No detection at all on non-MSVC builds: support is simply assumed.
     return true;
#endif
}

// Runs the CPUID instruction with EAX = InfoType and ECX = 0 and stores the
// resulting EAX, EBX, ECX, EDX into CPUInfo[0..3], in that order.
void cpuid(int CPUInfo[4],int InfoType){
    int eax;
    int ebx;
    int ecx;
    int edx;
    __asm__ __volatile__ (
        "cpuid"
        : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
        : "a" (InfoType), "c" (0)
    );
    CPUInfo[0] = eax;
    CPUInfo[1] = ebx;
    CPUInfo[2] = ecx;
    CPUInfo[3] = edx;
}
bool rtmSupported()
{
    int info[4];
    cpuid(info, 0);
    int nIds = info[0];
    if (nIds >= 0x00000001){
        cpuid(info,0x00000007);

        return (info[2] & ((int)1 <<  11)) != 0;
    }
    return false;
}
Someone should correct me if I am wrong...

--
press any key to continue or any other to quit...
U ničemu ja ne uživam kao u svom statusu INVALIDA -- Zli Zec
Na divljem zapadu i nije bilo tako puno nasilja, upravo zato jer su svi
bili naoruzani. -- Mladen Gogala



1
rocksolid light 0.7.2
clearnet  i2p  tor