Rocksolid Light

Welcome to novaBBS (click a section below)

mail  files  register  newsreader  groups  login

Message-ID:  

MOUNT TAPE U1439 ON B3, NO RING


computers / comp.arch / Re: My experience with Apple M1 chip

Re: My experience with Apple M1 chip

<c5a7429b-b7e6-4fbc-ac98-bf5160c3a87dn@googlegroups.com>

  copy mid

https://www.novabbs.com/computers/article-flat.php?id=18476&group=comp.arch#18476

  copy link   Newsgroups: comp.arch
X-Received: by 2002:a05:620a:1026:: with SMTP id a6mr26310181qkk.331.1625684057409;
Wed, 07 Jul 2021 11:54:17 -0700 (PDT)
X-Received: by 2002:a4a:d781:: with SMTP id c1mr19334913oou.23.1625684057173;
Wed, 07 Jul 2021 11:54:17 -0700 (PDT)
Path: i2pn2.org!i2pn.org!weretis.net!feeder6.news.weretis.net!news.snarked.org!border2.nntp.dca1.giganews.com!nntp.giganews.com!news-out.google.com!nntp.google.com!postnews.google.com!google-groups.googlegroups.com!not-for-mail
Newsgroups: comp.arch
Date: Wed, 7 Jul 2021 11:54:17 -0700 (PDT)
In-Reply-To: <187875de-0cd7-4e6e-b4a9-71a9eb1f5527n@googlegroups.com>
Injection-Info: google-groups.googlegroups.com; posting-host=87.68.182.191; posting-account=ow8VOgoAAAAfiGNvoH__Y4ADRwQF1hZW
NNTP-Posting-Host: 87.68.182.191
References: <T6VEI.159$VU3.17@fx46.iad> <19dcc459-6eb5-4191-a186-c50d12ed347fn@googlegroups.com>
<2l%EI.8$gE.2@fx21.iad> <80ac50a0-dde2-4a66-b09c-62663cd5b4aan@googlegroups.com>
<SJGdnUhI6OG5N3n9nZ2dnUU7-ffNnZ2d@giganews.com> <187875de-0cd7-4e6e-b4a9-71a9eb1f5527n@googlegroups.com>
User-Agent: G2/1.0
MIME-Version: 1.0
Message-ID: <c5a7429b-b7e6-4fbc-ac98-bf5160c3a87dn@googlegroups.com>
Subject: Re: My experience with Apple M1 chip
From: already5...@yahoo.com (Michael S)
Injection-Date: Wed, 07 Jul 2021 18:54:17 +0000
Content-Type: text/plain; charset="UTF-8"
Lines: 270
 by: Michael S - Wed, 7 Jul 2021 18:54 UTC

On Wednesday, July 7, 2021 at 2:16:50 AM UTC+3, Michael S wrote:
> On Tuesday, July 6, 2021 at 10:08:27 PM UTC+3, Kent Dickey wrote:
> > In article <80ac50a0-dde2-4a66...@googlegroups.com>,
> > Michael S <already...@yahoo.com> wrote:
> > >On Tuesday, July 6, 2021 at 7:16:01 PM UTC+3, Branimir Maksimovic wrote:
> > >> On 2021-07-06, Michael S <already...@yahoo.com> wrote:
> > >> > I can believe that M1@3.2 GHz/Rosetta is able to run x64 software as
> > >fast as i3-8100B but have trouble believing that it could match i7-8700B
> > >either in single thread or in multithread throughput. Unless, of course,
> > >absolute majority of run time spent in native libraries.
> > >> You don't count that M1 is ~25-33% faster single core then any x86 :P
> > >
> > >I took it into account.
> > >
> > >Besides, while it's true for x86 CPUs in prev-gen Mac-Mini it's not true
> > >for *any* x86.
> > >M1 is slower than top Zen3 bins and about the same or a little slower
> > >than top Comet Lake.
> > >Probably somewhat slower than top Tiger Lake, but that comparison is
> > >rather close.
> > >Probably, measurably slower than top Rocket Lake, but I didn't look at
> > >Rocket Lake closely.
> > I have a Mac Mini M1, and it seems fast--very fast for some workloads (hard to
> > predict branches, or working set in the 100-200KB range). It is not the
> > fastest CPU on the planet, but it likely is the fastest laptop CPU. At < 10W
> > at the AC plug it compares pretty favorably to 60W CPUs. If you have a
> > relatively short benchmark (say, one file, C or C++, can be run from the Unix
> > command line, doesn't require me to install anything else, should run in less
> > than 5 minutes), I can compile it and run it for you, and then you can compare
> > those results to any system you like. I don't think comparing optimized AVX
> > is going to be useful, but simple integer or floating point algorithms would
> > be best.
> >
> > Kent
> The Euler-413 challenge that we discussed today in comp.lang.c for nDigits=11 should run in about 3 minutes.
> But who is going to test on the fastest Zen3 ?
> I have Ryzen 7 5800H at work, it's pretty fast and with single thread easily blows away 4.25GHz Skylake, but it's much slower than likes of 5900X.
>
> Code:
> #include <stdint.h>
> #include <stdio.h>
> #include <stdlib.h>
> #include <string.h>
>
> static unsigned long long oneChildsInRange(int nDigits);
> int main(int argz, char** argv)
> {
> if (argz < 2) {
> fprintf(stderr, "Usage:\n%s nDigits\n", argv[0]);
> return 1;
> }
>
> char* endp;
> int nDigits = strtol(argv[1], &endp, 0);
> if (endp == argv[1]) {
> fprintf(stderr, "Bad nDigits argument '%s'. Not a number.\n", argv[1]);
> return 1;
> }
>
> if (nDigits < 5 || nDigits > 19) {
> fprintf(stderr, "Please specify nDigits argument in range [5:19].\n");
> return 1;
> }
>
> printf("%2d %20llu\n", nDigits, oneChildsInRange(nDigits));
> return 0;
> }
>
> static int countChilds(const uint8_t *digits, int nDigits, const uint8_t *remTab)
> {
> int nChilds = 0;
> for (int beg = 0; beg < nDigits; ++beg) {
> unsigned r = 0;
> for (int end = beg; end < nDigits; ++end) {
> r = remTab[r*10+digits[end]];
> nChilds += (r == 0);
> }
> if (nChilds > 1)
> break; // we don't try to distinguish between cases of (nChilds > 1)
> }
> return nChilds;
> }
>
> static int countChilds2(
> const uint8_t *prefix, int prefixlen,
> const uint8_t *suffix, int suffixlen,
> const uint8_t *remTab,
> int nChilds0)
> {
> int nChilds = nChilds0;
> for (int prefix_i = 0; prefix_i < prefixlen; ++prefix_i) {
> unsigned r = prefix[prefix_i];
> for (int i = 0; i < suffixlen; ++i) {
> r = remTab[r*10+suffix[i]];
> nChilds += (r == 0);
> }
> if (nChilds > 1)
> break; // we don't try to distinguish between cases of (nChilds > 1)
> }
> return nChilds;
> }
>
> static void preprocessPrefix(uint8_t *dst, const uint8_t *src, int nDigits, const uint8_t *remTab)
> {
> for (int beg = 0; beg < nDigits; ++beg) {
> unsigned r = 0;
> for (int end = beg; end < nDigits; ++end)
> r = remTab[r*10+src[end]];
> dst[beg] = r;
> }
> }
>
>
> static unsigned long long intpow(unsigned base, int pow)
> {
> unsigned long long prod = 1;
> for (int k = 0; k < pow; ++k)
> prod *= base;
> return prod;
> }
>
> static void to_digits(uint8_t* dst, unsigned long long x, int nDigits)
> {
> for (int k = 0; k < nDigits; ++k) {
> dst[nDigits-1-k] = x % 10;
> x /= 10;
> }
> }
>
> static unsigned long long oneChildsInRange(int nDigits)
> {
> // initialize look-up table
> uint8_t remTab[200];
> for (int i = 0; i < nDigits*10; ++i)
> remTab[i] = i % nDigits;
>
> // initialize table of suffixes
> uint8_t suffixes[10000][4];
> int nSuff0 = 0;
> int nSuff1 = 0;
> for (int i = 0; i < 10000; ++i) {
> uint8_t suffix[4];
> to_digits(suffix, i, 4); // convert suffix to array of digits
> int nc = countChilds(suffix, 4, remTab);
> if (nc < 2) {
> if (nc == 0) {
> memcpy(suffixes[nSuff0], suffix, sizeof(suffixes[0]));
> ++nSuff0;
> } else { // nc==1
> memcpy(suffixes[9999-nSuff1], suffix, sizeof(suffixes[0])); // store starting from the end of array
> ++nSuff1;
> }
> }
> }
> if (nSuff1 > 0) // make suffixes[] array continuous
> memmove(suffixes[nSuff0], suffixes[10000-nSuff1], nSuff1*sizeof(suffixes[0]));
>
> unsigned long long cnt = 0;
> unsigned long long pref0 = intpow(10, nDigits-5);
> for (unsigned long long pref = pref0; pref < pref0*10; ++pref) {
> uint8_t prefix[20];
> to_digits(prefix, pref, nDigits-4); // convert prefix to array of digits
> int nc = countChilds(prefix, nDigits-4, remTab);
> if (nc < 2) {
> uint8_t processed_prefix[20];
> preprocessPrefix(processed_prefix, prefix, nDigits-4, remTab);
> for (int i = 0; i < nSuff0; ++i) // concatenate suffix with 0 children to prefix with 0 or 1 children
> cnt += (countChilds2(processed_prefix, nDigits-4, suffixes[i], 4, remTab, nc)==1);
> if (nc == 0) {
> for (int i = nSuff0; i < nSuff0+nSuff1; ++i) // concatenate suffix with 1 child to prefix with 0 children
> cnt += (countChilds2(processed_prefix, nDigits-4, suffixes[i], 4, remTab, 1)==1);
> }
> }
> }
>
> return cnt;
> }

In the meantime, I simplified and improved this program.
With the new variant, nDigits=11 is no longer interesting as a benchmark (too fast), but nDigits=12 and nDigits=13 are now well suited.
On my Xeon-E they took, respectively, 9m47.099s and 1m50.575s
Code:

//-- beg
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static unsigned long long oneChildsInRange(int nDigits);
/*
 * Program entry point.
 *
 * Expects exactly one meaningful argument, nDigits, in the range [11:19],
 * and prints nDigits followed by the value returned by oneChildsInRange().
 *
 * Returns 0 on success and 1 on any usage or argument error (the diagnostic
 * is written to stderr).
 */
int main(int argz, char** argv)
{
    if (argz < 2) {
        fprintf(stderr, "Usage:\n%s nDigits\n", argv[0]);
        return 1;
    }

    /*
     * Keep the parsed value as a long until it has been range-checked:
     * narrowing to int first could wrap a huge input into the accepted range.
     * On overflow strtol() returns LONG_MAX/LONG_MIN, which the range check
     * below rejects as well, so errno does not need to be inspected.
     */
    char* endp;
    long parsed = strtol(argv[1], &endp, 0);
    if (endp == argv[1] || *endp != '\0') {
        /* nothing was converted, or trailing characters follow the number */
        fprintf(stderr, "Bad nDigits argument '%s'. Not a number.\n", argv[1]);
        return 1;
    }

    if (parsed < 11 || parsed > 19) {
        fprintf(stderr, "Please specify nDigits argument in range [11:19].\n");
        return 1;
    }

    int nDigits = (int)parsed; /* safe: 11 <= parsed <= 19 */
    printf("%2d %20llu\n", nDigits, oneChildsInRange(nDigits));
    return 0;
}

/*
 * Lookup tables indexed by (remainder + digit).
 * The 19+10 bound covers every index the search below forms when the
 * remainders stored in prefixRem[] are below 19.
 */
typedef struct {
    uint8_t isChildTab[19 + 10]; /* [i] = 1 when i % nDigits == 0, else 0 */
    uint8_t x10remTab [19 + 10]; /* [i] = (10*i) % nDigits                */
} tabs_t;

/*
 * Depth-first search over the decimal digits that can follow a prefix.
 *
 * prefix_nChilds  number of "children" already found in the prefix (0 or 1)
 * prefixRem       prefixRem[i] is the remainder carried for the digit run
 *                 that starts at position i of the prefix and reaches its
 *                 end; each time a digit d is appended it is updated to
 *                 x10remTab[prefixRem[i] + d]
 * prefixlen       number of entries in prefixRem
 * tabs            lookup tables for the nDigits being searched
 *
 * Returns the sum of the final child counts (each 0 or 1) over all
 * completions whose running child count never reaches 2.
 */
static unsigned long long countChildsRecursive(
    int prefix_nChilds,
    const uint8_t prefixRem[],
    int prefixlen,
    const tabs_t* tabs)
{
    unsigned long long total = 0;

    /*
     * When the prefix already has a child, digit 0 is skipped: the code
     * below always counts a lone 0 as one more child, which would make two.
     */
    for (int digit = prefix_nChilds; digit <= 9; ++digit) {
        /* a lone 0 digit counts as a child; other digits start from the
           count inherited from the prefix */
        int hits = (digit == 0) ? 1 : prefix_nChilds;

        /* every run that ends at the new digit and tests as a child */
        for (int k = 0; k < prefixlen; ++k)
            hits += tabs->isChildTab[digit + prefixRem[k]];

        if (hits >= 2)
            continue; /* two or more children: prune this branch */

        if (tabs->isChildTab[prefixlen + 1]) {
            /* prefixlen+1 is a multiple of nDigits: all digits are placed */
            total += (unsigned long long)hits;
            continue;
        }

        /* otherwise advance every remainder by the new digit and recurse */
        uint8_t extended[20];
        for (int k = 0; k < prefixlen; ++k)
            extended[k] = tabs->x10remTab[digit + prefixRem[k]];
        extended[prefixlen] = tabs->x10remTab[digit];

        total += countChildsRecursive(hits, extended, prefixlen + 1, tabs);
    }

    return total;
}

/*
 * Builds the lookup tables for the given nDigits and sums the results of
 * countChildsRecursive() over every leading digit 1..9 (the "one-child"
 * count discussed in the surrounding post).
 *
 * nDigits must be non-zero (it is used as a modulus); main() restricts it
 * to [11:19].
 */
static unsigned long long oneChildsInRange(int nDigits)
{
    enum { TAB_ENTRIES = 19 + 10 };

    /* fill both tables: divisibility flag and (10*i) mod nDigits */
    tabs_t tabs;
    int idx = 0;
    while (idx < TAB_ENTRIES) {
        tabs.isChildTab[idx] = (idx % nDigits) == 0;
        tabs.x10remTab [idx] = (idx * 10) % nDigits;
        ++idx;
    }

    /* a one-digit prefix starts with no children; the leading digit is never 0 */
    unsigned long long total = 0;
    for (int lead = 1; lead <= 9; ++lead) {
        uint8_t seedRem = tabs.x10remTab[lead];
        total += countChildsRecursive(0, &seedRem, 1, &tabs);
    }

    return total;
}

//-- end

Unlike the previous code, this variant on x86-64 is faster when compiled with 'clang -march=native -O2'.
gcc is significantly slower.

SubjectRepliesAuthor
o My experience with Apple M1 chip

By: Branimir Maksimovic on Sun, 4 Jul 2021

73Branimir Maksimovic
server_pubkey.txt

rocksolid light 0.9.81
clearnet tor