Home > Archive > Fortran > March 2007 > Re: Is C faster than fortran?
You are viewing an archived Text-only version of the thread.
To view this thread in it's original format and/or if you want to reply to
this thread please [click here]
| Author |
Re: Is C faster than fortran?
|
|
| user923005 2007-03-27, 7:09 pm |
| On Mar 27, 8:41=C2=A0am, p...@see.signature.invalid (Pierre Asselin) wrote:
> Noma...@gmail.com <Noma...@gmail.com> wrote:
>
> I think you can quickly learn enough Fortran at a superficial level
> to do some number crunching. =C2=A0Just be good and use "implicit none".
>
> Since you already know C, I would say: write your first project in C
> but learn enough Fortran to translate it. =C2=A0Make sure the answers
> are the same, then race the two and post the results here :-)
>
> Seriously. =C2=A0Fortran compilers can optimize more aggressively than
> C because the language semantics are different. =C2=A0C99 plugs this
> gap (mostly) with the "restrict" qualifier, but I don't know how
> that plays out in practice and I would love to see data.
Using:
http://gcc.gnu.org/ml/fortran/2005-...29/TEST_FPU.f90
Via:
dcorbit@DCORBIT64 /f/tmp
$ g95 -O3 -Wall test_fpu.f90
In file test_fpu.f90:83
91 FORMAT (A,I4,2('/',I2.2))
1
Warning (110): Label 91 at (1) defined but not used
In file test_fpu.f90:2293
INTEGER :: i , info , j , l , ncola , nrowa , nrowb
1
Warning (112): Variable 'ncola' at (1) is set but never used
test_fpu.f90: In function 'dtrmv_':
test_fpu.f90:3611: warning: 'kx' may be used uninitialized in this
function
dcorbit@DCORBIT64 /f/tmp
$ ./a
Benchmark running, hopefully as only ACTIVE task
Test1 - Gauss 2000 (101x101) inverts 15.1 sec Err=3D 0.000000000000002
Test2 - Crout 2000 (101x101) inverts 7.4 sec Err=3D 0.000000000000000
Test3 - Crout 2 (1001x1001) inverts 13.8 sec Err=3D 0.000000000000005
Test4 - Lapack 2 (1001x1001) inverts 10.0 sec Err=3D 0.000000000000417
total =3D 46.3 sec
dcorbit@DCORBIT64 /f/tmp
$ g95 --version
G95 (GCC 4.1.2 (g95 0.91!) Mar 21 2007)
Copyright (C) 2002-2005 Free Software Foundation, Inc.
G95 comes with NO WARRANTY, to the extent permitted by law.
You may redistribute copies of G95
under the terms of the GNU General Public License.
For more information about these matters, see the file named COPYING
Using:
/*
* --------------------------------------------------------------
* TEST_FPU A number-crunching benchmark using matrix inversion.
* Implemented by: David Frank DaveGemini@aol.com
* Gauss routine by: Tim Prince N8TM@aol.com
* Crout routine by: James Van Buskirk torsop@ix.netcom.com
* F90->C source by: Sergey N. Voronkov serg@ggd.nsu.ru
* Pointer exchange version by: Dieter Buerssner buers@gmx.de
* --------------------------------------------------------------
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h>
/*
* Compiling with NI =3D 1001 (default) generates pool(51,51,1000) =3D
20mb.
* If test system pages due to insufficient memory (disk i/o activity
* occurs), abort run and compile with NI =3D 200, benchmark will
adjust
* time x 5.
*/
#define NI 1001
#define NN 51
#define RK8 double
/* below are additional C routines supplied by translator */
void memflt()
{
fputs("Memory allocation error\n", stderr);
exit(EXIT_FAILURE);
}
void alloc_arrays(RK8 ** p[NI], RK8 *** a, RK8 *** b)
{
int i,
j;
for (i =3D 0; i < NI; i++) {
if ((p[i] =3D (RK8 **) malloc(NN * sizeof(RK8 *))) =3D=3D NULL)
memflt();
for (j =3D 0; j < NN; j++)
if ((p[i][j] =3D (RK8 *) malloc(NN * sizeof(RK8))) =3D=3D NULL)
memflt();
}
if ((*a =3D (RK8 **) malloc(NN * sizeof(RK8 *))) =3D=3D NULL ||
(*b =3D (RK8 **) malloc(NN * sizeof(RK8 *))) =3D=3D NULL)
memflt();
for (i =3D 0; i < NN; i++)
if (((*a)[i] =3D (RK8 *) malloc(NN * sizeof(RK8))) =3D=3D NULL ||
((*b)[i] =3D (RK8 *) malloc(NN * sizeof(RK8))) =3D=3D NULL)
memflt();
}
void random_number(RK8 ** pool[NI])
{
int i,
j,
k;
for (i =3D 0; i < NI; i++)
for (j =3D 0; j < NN; j++)
for (k =3D 0; k < NN; k++)
pool[i][j][k] =3D (RK8) (rand()) / RAND_MAX;
}
RK8
timesec()
{
return (RK8) (clock()) / CLOCKS_PER_SEC;
}
/* prototype the invert functions that follow exec source */
void Gauss(RK8 ** a, RK8 ** b, int n);
void Crout(RK8 ** a, RK8 ** b, int n);
int rgaussi(RK8 ** a, RK8 ** b, int n);
int main()
{
RK8 **pool[NI]; /* pool of matrices to invert */
RK8 **a,
**ai; /* working matrices use < 256k */
RK8 avg_err,
total_time,
time1;
int i,
j,
n;
char *revision =3D "01/10/98"; /* Gauss speedup mod */
char invert_id[3][8] =3D
{
"Gauss", "Crout", "Dieter"};
struct tm *ptm;
time_t crtime;
FILE *fp;
/* Begin by allocating matrix arrays */
alloc_arrays(pool, &a, &ai);
puts("Benchmark running, hopefully as only ACTIVE task");
if ((fp =3D fopen("test_fpc.dat", "w")) =3D=3D NULL) {
fprintf(stderr, "Can't open output file!\n");
return EXIT_FAILURE;
}
crtime =3D time(NULL);
ptm =3D gmtime(&crtime);
fprintf(fp, "Date run =3D %2d/%2d/%2d\n",
ptm->tm_mon + 1, ptm->tm_mday, ptm->tm_year);
fputs("Pls supply info below, send to DaveGemini@aol.com\n"
"Tester name/ net address =3D \n"
"CPU mfg/id/Mhz =3D \n"
"MEM/CACHE =3D \n"
"O.S. / COMPILER =3D \n"
"Additional comments =3D \n\n\n\n\n", fp);
fprintf(fp, "Results for %s revision using TEST_FPU.C \n",
revision);
srand(time(NULL)); /* set seed to random number based on time */
random_number(pool); /* fill pool with random data ( 0. -> 1. ) */
for (n =3D 0; n < 3; n++) { /* for Gauss,Crout algorithms */
time1 =3D timesec(); /* start benchmark n time */
for (i =3D 0; i < NI; i++) {
/* get next matrix to invert */
for (j =3D 0; j < NN; j++)
memcpy(a[j], pool[i][j], sizeof(RK8) * NN);
switch (n) {
case 0:
Gauss(a, ai, NN); /* invert a -> ai ; destructs a */
Gauss(ai, a, NN); /* invert ai -> a */
break;
case 1:
Crout(a, ai, NN); /* invert a -> ai ; nondestructs a
*/
Crout(ai, a, NN); /* invert ai -> a */
break;
case 2:
rgaussi(a, ai, NN); /* invert a -> ai ; nondestructs a
*/
rgaussi(ai, a, NN); /* invert ai -> a */
break;
}
}
total_time =3D timesec() - time1; /* =3D elapsed time sec. */
/* check accuracy last matrix invert. */
avg_err =3D 0;
for (i =3D 0; i < NN; i++)
for (j =3D 0; j < NN; j++)
avg_err +=3D fabs(a[i][j] - pool[NI - 1][i][j]);
if (NI =3D=3D 200)
fprintf(fp, "\n%s 5 x 200 x 2 inverts =3D %6.1f sec.\n",
invert_id[n], 5 * total_time);
else
fprintf(fp, "\n%s 1000 x 2 inverts =3D %6.1f sec.\n",
invert_id[n], total_time);
fputs("Accuracy of 2 computed numbers\n", fp);
fprintf(fp, "Original =3D %18.15f %18.15f\n",
pool[NI - 1][0][0], pool[NI - 1][NN - 1][NN - 1]);
fprintf(fp, "Computed =3D %18.15f %18.15f\n",
a[0][0], a[NN - 1][NN - 1]);
fprintf(fp, "Avg Err. =3D %18.15f\n", avg_err / (NN * NN));
} /* for Gauss,Crout algorithms */
puts("Results written to: TEST_FPC.DAT");
return EXIT_SUCCESS;
}
/*
* --------------------------------------
* Invert matrix a -> b by Gauss method
* --------------------------------------
*/
void Gauss(RK8 ** a, RK8 ** b, int n)
{
RK8 d,
temp =3D 0,
c;
int i,
j,
k,
m,
nn,
*ipvt;
if ((ipvt =3D (int *) malloc(n * sizeof(int))) =3D=3D NULL)
memflt();
nn =3D n;
for (i =3D 0; i < nn; i++)
ipvt[i] =3D i;
for (k =3D 0; k < nn; k++) {
temp =3D 0.;
m =3D k;
for (i =3D k; i < nn; i++) {
d =3D a[k][i];
if (fabs(d) > temp) {
temp =3D fabs(d);
m =3D i;
}
}
if (m !=3D k) {
j =3D ipvt[k];
ipvt[k] =3D ipvt[m];
ipvt[m] =3D j;
for (j =3D 0; j < nn; j++) {
temp =3D a[j][k];
a[j][k] =3D a[j][m];
a[j][m] =3D temp;
}
}
d =3D 1 / a[k][k];
for (j =3D 0; j < k; j++) {
c =3D a[j][k] * d;
for (i =3D 0; i < nn; i++)
a[j][i] -=3D a[k][i] * c;
a[j][k] =3D c;
}
for (j =3D k + 1; j < nn; j++) {
c =3D a[j][k] * d;
for (i =3D 0; i < nn; i++)
a[j][i] -=3D a[k][i] * c;
a[j][k] =3D c;
}
for (i =3D 0; i < nn; i++)
a[k][i] =3D -a[k][i] * d;
a[k][k] =3D d;
}
for (i =3D 0; i < nn; i++)
memcpy(b[ipvt[i]], a[i], sizeof(RK8) * nn);
free(ipvt);
}
/*
* --------------------------------------
* Invert matrix a -> b by Crout method
* --------------------------------------
*/
void Crout(RK8 ** a, RK8 ** b, int n)
{
int i,
j; /* Current row & column */
int maxlok; /* Location of maximum pivot */
int *index; /* Partial pivot record */
RK8 *temp =3D 0,
the_max;
RK8 tmp,
*ptr;
RK8 *matr =3D 0;
int k,
ind,
ind2;
if ((index =3D (int *) malloc(n * sizeof(int))) =3D=3D NULL ||
(temp =3D (RK8 *) malloc(n * sizeof(RK8))) =3D=3D NULL ||
(matr =3D (RK8 *) malloc(n * n * sizeof(RK8))) =3D=3D NULL)
memflt();
/* Initialize everything */
for (i =3D 0; i < n; i++)
index[i] =3D i;
/* Shuffle matrix */
for (j =3D 0; j < n; j++) {
for (i =3D 0; i < j; i++)
b[j][i] =3D a[j][i];
for (i =3D j; i < n; i++)
b[j][i] =3D a[i - j][n - j - 1];
}
/* LU decomposition; reciprocals of diagonal elements in L matrix */
for (j =3D 0; j < n; j++) {
/* Get current column of L matrix */
for (i =3D j; i < n; i++) {
tmp =3D 0;
ind =3D n - i - 1;
for (k =3D 0; k < j; k++)
tmp +=3D b[ind][ind + k] * b[j][k];
b[ind][ind + j] -=3D tmp;
}
maxlok =3D 0;
the_max =3D fabs(b[0][j]);
for (i =3D 1; i < n - j; i++)
if (fabs(b[i][j + i]) >=3D the_max) {
the_max =3D fabs(b[i][j + i]);
maxlok =3D i;
}
/* det =3D det*b(j+maxlok-1,maxlok) */
b[maxlok][j + maxlok] =3D 1 / b[maxlok][j + maxlok];
/* Swap biggest element to current pivot position */
if (maxlok + 1 !=3D n - j) {
ind =3D n - maxlok - 1;
ind2 =3D index[j];
index[j] =3D index[ind];
index[ind] =3D ind2;
for (k =3D n - maxlok; k < n; k++) {
tmp =3D b[k][j];
b[k][j] =3D b[k][ind];
b[k][ind] =3D tmp;
}
memcpy(temp, &(b[maxlok][maxlok]), sizeof(RK8) * (n -
maxlok));
ptr =3D &(b[n - j - 1][n - j - 1]);
memmove(&(b[maxlok][maxlok]), ptr, sizeof(RK8) * (j + 1));
for (k =3D j + 1; k < n - maxlok; k++)
b[maxlok][maxlok + k] =3D b[k][j];
memcpy(ptr, temp, (j + 1) * sizeof(RK8));
for (k =3D j + 1; k < n - maxlok; k++)
b[k][j] =3D temp[k];
}
/* Get current row of U matrix */
ind =3D n - j - 1;
for (i =3D j + 1; i < n; i++) {
tmp =3D 0.;
for (k =3D 0; k < j; k++)
tmp +=3D b[i][k] * b[ind][ind + k];
b[i][j] =3D b[ind][n - 1] * (b[i][j] - tmp);
}
} /* END DO LU_outer */
/* Invert L matrix */
for (j =3D 0; j < n - 1; j++) {
temp[0] =3D b[n - j - 1][n - 1];
for (i =3D j + 1; i < n; i++) {
ind =3D n - i - 1;
tmp =3D 0.;
for (k =3D 0; k < i - j; k++)
tmp +=3D temp[k] * b[ind][ind + j + k];
b[ind][ind + j] =3D -tmp * b[ind][n - 1];
temp[i - j] =3D b[ind][ind + j];
}
}
/* Reshuffle matrix */
for (i =3D 0; i < (n + 1) / 3; i++) {
memcpy(temp, &(b[i][2 * (i + 1) - 1]), sizeof(RK8) * (n + 2 -
3 * (i +1)));
for (j =3D 2 * (i + 1) - 1; j < n - i; j++)
b[i][j] =3D b[n - j - 1][n - j + i - 1];
ind =3D n - i - 1;
for (j =3D i; j < n + 1 - 2 * (i + 1); j++)
b[j][i + j] =3D b[n - i - j - 1][ind];
for (k =3D 0; k < n + 2 - 3 * (i + 1); k++)
b[i + 1 + k][ind] =3D temp[k];
}
/* Invert U matrix */
for (i =3D 0; i < n - 1; i++) {
for (j =3D i + 1; j < n; j++) {
tmp =3D 0.;
for (k =3D 0; k < j - i - 1; k++)
tmp +=3D temp[k] * b[j][i + 1 + k];
b[j][i] =3D -b[j][i] - tmp;
temp[j - i - 1] =3D b[j][i];
}
}
/* Multiply inverses in reverse order */
for (i =3D 0; i < n - 1; i++) {
for (k =3D 0; k < n - i - 1; k++)
temp[k] =3D b[i + 1 + k][i];
for (j =3D 0; j <=3D i; j++) {
tmp =3D 0.;
for (k =3D 0; k < n - i - 1; k++)
tmp +=3D temp[k] * b[j][i + 1 + k];
b[j][i] +=3D tmp;
}
for (j =3D i + 1; j < n; j++) {
tmp =3D 0.;
for (k =3D j; k < n; k++)
tmp +=3D temp[k - i - 1] * b[j][k];
b[j][i] =3D tmp;
}
}
/* Straighten out the columns of the result */
for (i =3D 0; i < n; i++)
memcpy(matr + n * i, b[i], sizeof(RK8) * n);
for (i =3D 0; i < n; i++)
memcpy(b[index[i]], matr + n * i, sizeof(RK8) * n);
free(index);
free(temp);
free(matr);
}
/*
** This routine is due to buers@gmx.de (Dieter Buerssner)
** Destroys a, return 0: success, 1: zero pivot, 2: out of mem.
*/
int rgaussi(RK8 ** a, RK8 ** b, int n)
{
int i,
j,
k,
maxj,
t;
RK8 maxel,
pivinv,
tmaxel,
aji;
RK8 *tp,
*ai,
*aj;
/* C99: int ipiv[n]; */
int *ipiv =3D malloc(n * sizeof *ipiv);
if (ipiv =3D=3D NULL)
return 2;
for (i =3D 0; i < n; i++)
ipiv[i] =3D i;
for (i =3D 0; i < n; i++) {
maxj =3D -1;
maxel =3D 0.0;
/* find pivot element */
for (j =3D i; j < n; j++)
if ((tmaxel =3D fabs(a[j][i])) > maxel) {
maxj =3D j;
maxel =3D tmaxel;
}
if (maxj < 0) {
free(ipiv);
return 1;
}
/* exchange rows */
if (maxj !=3D i) {
/* just exchange pointers for a */
tp =3D a[maxj];
a[maxj] =3D a[i];
a[i] =3D tp;
t =3D ipiv[maxj];
ipiv[maxj] =3D ipiv[i];
ipiv[i] =3D t;
}
ai =3D a[i];
pivinv =3D 1.0 / ai[i];
ai[i] =3D 1.0;
for (k =3D 0; k < n; k++)
ai[k] *=3D pivinv;
for (j =3D 0; j < n; j++) {
if (j !=3D i) {
aj =3D a[j];
aji =3D aj[i];
aj[i] =3D 0.0;
for (k =3D 0; k < n; k++)
aj[k] -=3D aji * ai[k];
}
}
}
for (i =3D 0; i < n; i++)
for (j =3D 0; j < n; j++)
b[i][ipiv[j]] =3D a[i][j];
free(ipiv);
return 0;
}
via:
CL /Ox /Ob2 /Oi /Ot /Oy /GT /GL /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /
D "_UNICODE" /D "UNICODE" /FD /MD /Zp16 /fp:fast /Fo"Release\\" /
Fd"Release\vc80.pdb" /W4 /nologo /c /Wp64 /Zi /Gr /TP /wd4996 /
errorReport:prompt
Date run =3D 3/27/107
Pls supply info below, send to DaveGemini@aol.com
Tester name/ net address =3D {Dann Corbit/dcorbit@connx.com}
CPU mfg/id/Mhz =3D
CPU Identification utility v1.11 (c) 1997-2005 Jan
Steunebrink
=20
=E2=94=80=E2=94=80=E2=94=80=E2=94=80=E2=
94=80=E2=94=80=E2=94=80=E2=94=80=E2=
=94=80=E2=94=80=E2=94=80=E2=94=80=E2=94=
80=E2=94=80=E2=94=80=E2=94=80=E2=94=
=80=E2=94=80=E2=94=80=E2=94=80=E2=94=80=
E2=94=80=E2=94=80=E2=94=80=E2=94=80=
=E2=94=80=E2=94=80=E2=94=80=E2=94=80=E2=
94=80=E2=94=80=E2=94=80=E2=94=80=E2=
=94=80=E2=94=80=E2=94=80=E2=94=80=E2=94=
80=E2=94=80=E2=94=80=E2=94=80=E2=94=
=80=E2=94=80=E2=94=80=E2=94=80=E2=94=80=
E2=94=80=E2=94=80=E2=94=80=E2=94=80=
=E2=94=80=E2=94=80=E2=94=80=E2=94=80=E2=
94=80=E2=94=80=E2=94=80=E2=94=80=E2=
=94=80=E2=94=80=E2=94=80=E2=94=80=E2=94=
80=E2=94=80=E2=94=80=E2=94=80=E2=94=
=80=E2=94=80=E2=94=80=E2=94=80=E2=94=80=
E2=94=80=E2=94=80=E2=94=80=E2=94=80=
=E2=94=80=E2=94=80=E2=94=80
CPU Vendor and Model: AMD Athlon 64 2800+-3700+
Internal CPU speed : 2199.4 MHz (using internal Time Stamp Counter)
Clock Multiplier : Available only in Real Mode!
CPU-ID Vendor string: AuthenticAMD
CPU-ID Name string : AMD Athlon(tm) 64 Processor 3400+
CPU-ID Signature : 0F4A
=E2=94=82=E2=94=82=E2=94=82=E2=94=94=E2=
94=80 Steppi=
ng or sub-model no.
=E2=94=82=E2=94=82=E2=94=94=E2=94=80 Model: Indicate=
s CPU Model and 486 L1
cache mode
=E2=94=82=E2=94=94=E2=94=80 Family: 4=3D486, Am5x86,=
Cx5x86
=E2=94=82 5=3DPentium, Nx586, Cx6x86, K5/K=
6,
C6, mP6
=E2=94=82 6=3DPentiumPro/II/III, CxMII/III,
Athlon, C3
=E2=94=82 F=3DPentium4, Athlon64
=E2=94=94=E2=94=80 Type: 0=3DStandard, 1=3DOverdrive=
, 2=3D2nd Dual
Pentium
Current CPU mode : Protected
Internal (L1) cache : Enabled in Write-Back mode
MEM/CACHE =3D 2GB physical RAM
O=2ES. / COMPILER =3D Windows 2003 / Microsoft (R) 32-bit C/C++ Optimizing
Compiler Version 14.00.50727.42 for 80x86
Additional comments =3D
Results for 01/10/98 revision using TEST_FPU.C
Gauss 1000 x 2 inverts =3D 0.4 sec.
Accuracy of 2 computed numbers
Original =3D 0.753715628528703 0.364574114200262
Computed =3D 0.753715628528701 0.364574114200264
Avg Err. =3D 0.000000000000001
Crout 1000 x 2 inverts =3D 0.7 sec.
Accuracy of 2 computed numbers
Original =3D 0.753715628528703 0.364574114200262
Computed =3D 0.753715628528701 0.364574114200266
Avg Err. =3D 0.000000000000003
Dieter 1000 x 2 inverts =3D 0.4 sec.
Accuracy of 2 computed numbers
Original =3D 0.753715628528703 0.364574114200262
Computed =3D 0.753715628528703 0.364574114200264
Avg Err. =3D 0.000000000000002
We need a Fortran Guru to show me what I am doing wrong, because there
is no feasible explanation for this other than I do not know how to
get performance out of my Fortran compiler.
| |
| glen herrmannsfeldt 2007-03-27, 10:09 pm |
| user923005 wrote:
> On Mar 27, 8:41 am, p...@see.signature.invalid (Pierre Asselin) wrote:
(snip)
(snip)
[color=darkred]
> We need a Fortran Guru to show me what I am doing wrong, because there
> is no feasible explanation for this other than I do not know how to
> get performance out of my Fortran compiler.
That is what they say, and maybe on the average it is true.
There are too many variables to make it true in general.
First, the programs should be as similar as possible, and also
the data. In the (snipped) programs, the C version uses arrays of
pointers to arrays of pointers to arrays of pointers, which is not
likely true for Fortran, and might make a difference either way.
The programs are doing matrix inversion of random numbers, which
are generated using system dependent generators. There may be some
data dependence to the timing of matrix inversion, especially
if the hardware timing is different for different data.
Optimization is probably more sensitive to the number of registers
available than to the ability to rearrange expressions.
Do both Fortran and C programs have the array subscripts in the
optimal order? Most compilers won't change that.
-- glen
| |
| Lane Straatman 2007-03-27, 10:09 pm |
|
"glen herrmannsfeldt" <gah@ugcs.caltech.edu> wrote in message
news:gPWdnZ0Jt4DDIZTbnZ2dnUVZ_hKdnZ2d@co
mcast.com...
> user923005 wrote:
[follow-up set to c.l.f.]
>
>
> That is what they say, and maybe on the average it is true.
>
> There are too many variables to make it true in general.
> First, the programs should be as similar as possible, and also
> the data. In the (snipped) programs, the C version uses arrays of
> pointers to arrays of pointers to arrays of pointers, which is not
> likely true for Fortran, and might make a difference either way.
>
> The programs are doing matrix inversion of random numbers, which
> are generated using system dependent generators. There may be some
> data dependence to the timing of matrix inversion, especially
> if the hardware timing is different for different data.
>
> Optimization is probably more sensitive to the number of registers
> available than to the ability to rearrange expressions.
> Do both Fortran and C programs have the array subscripts in the
> optimal order? Most compilers won't change that.
This OP is well-known for his numerical skills in C. I can't speak to
optimizing execution speed. If OP would like to see a couple examples from
MR&C on array features, I'll be happy to donate some keystrokes.
--
LS
| |
| Pierre Asselin 2007-03-30, 4:08 am |
| In comp.lang.c user923005 <dcorbit@connx.com> wrote:
> Using:
> http://gcc.gnu.org/ml/fortran/2005-...29/TEST_FPU.f90
> Via:
> dcorbit@DCORBIT64 /f/tmp
> $ g95 -O3 -Wall test_fpu.f90
> [ ... ]
> dcorbit@DCORBIT64 /f/tmp
> $ ./a
> Benchmark running, hopefully as only ACTIVE task
> Test1 - Gauss 2000 (101x101) inverts 15.1 sec Err= 0.000000000000002
> Test2 - Crout 2000 (101x101) inverts 7.4 sec Err= 0.000000000000000
> Test3 - Crout 2 (1001x1001) inverts 13.8 sec Err= 0.000000000000005
> Test4 - Lapack 2 (1001x1001) inverts 10.0 sec Err= 0.000000000000417
> total = 46.3 sec
These numbers are quite reasonable. At O(n^3) operations per matrix
inversion, they correspond to 136, 278, 145 and 201 megaflops. I
didn't do an accurate flop count, just 2000*101^3 ops for the first
two and 2*1001^3 for the last two.
> Using:
> [ similar C program omitted, except: ]
> [ #define NI 1001 ] --number of repeats
> [ #define NN 51 ] --matrix size
> via:
> CL /Ox /Ob2 /Oi /Ot /Oy /GT /GL /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /
> D "_UNICODE" /D "UNICODE" /FD /MD /Zp16 /fp:fast /Fo"Release\\" /
> Fd"Release\vc80.pdb" /W4 /nologo /c /Wp64 /Zi /Gr /TP /wd4996 /
> errorReport:prompt
> Results for 01/10/98 revision using TEST_FPU.C
> Gauss 1000 x 2 inverts = 0.4 sec.
> Accuracy of 2 computed numbers
> Original = 0.753715628528703 0.364574114200262
> Computed = 0.753715628528701 0.364574114200264
> Avg Err. = 0.000000000000001
> Crout 1000 x 2 inverts = 0.7 sec.
> Accuracy of 2 computed numbers
> Original = 0.753715628528703 0.364574114200262
> Computed = 0.753715628528701 0.364574114200266
> Avg Err. = 0.000000000000003
> Dieter 1000 x 2 inverts = 0.4 sec.
> Accuracy of 2 computed numbers
> Original = 0.753715628528703 0.364574114200262
> Computed = 0.753715628528703 0.364574114200264
> Avg Err. = 0.000000000000002
> We need a Fortran Guru to show me what I am doing wrong, because there
> is no feasible explanation for this other than I do not know how to
> get performance out of my Fortran compiler.
From a cursory inspection of the C, these are smaller systems: 2002
inversions of 51x51 matrices compared to the 2000 101x101 and 2
1001x1001 of the Fortran example. The numbers correspond to ~332,
190 and 332 megaflops, but the accuracy may not be so good because
of the short times.
On the Fortran code, I get
gfortran -o fortran -O3 -funroll-loops TEST_FPU.f90
../fortran
Benchmark running, hopefully as only ACTIVE task
Test1 - Gauss 2000 (101x101) inverts 6.6 sec Err= 0.000000000000003
Test2 - Crout 2000 (101x101) inverts 7.1 sec Err= 0.000000000000000
Test3 - Crout 2 (1001x1001) inverts 22.9 sec Err= 0.000000000000001
Test4 - Lapack 2 (1001x1001) inverts 20.8 sec Err= 0.000000000000273
total = 57.3 sec
Approx 312, 290, 87.6 and 96.4 megaflops (on a little laptop, must
be spilling out of cache).
I translated Gauss() to C by hand but not the other two (especially
not Lapack() !). I'll append my code below. I basically transliterated
the algorithm, using C99 VLA's instead of arrays of pointers. I
also transposed all the matrices so the memory access patterns
would be similar in C and in Fortran; this is ok mathematically,
matrix inversion commutes with transposition. I added a unit test,
not shown, to check that multiplying a matrix by the alleged inverse
returned by Gauss() yields the identity matrix.
I get:
cc -o C -std=c99 -O3 -funroll-loops -Wall TEST_FPU.c
../C
Test1 - Gauss 2000 (101x101) inverts 6.415025 sec Err=0.000000
--rest not implemented--
so comparable to Fortran. Maybe a smidge faster, in fact.
Here is the C version of Gauss(), followed by the original
Fortran. Oh, and the "restrict" keyword doesn't make one
bit of difference.
****************************************
******************************
static inline int isamax(int n, double v[n])
{
int i, imax;
double tmp, max;
for(i= 1, imax= 0, max= fabs(v[0]); i<n; i++) {
tmp= fabs(v[i]);
if(tmp>max) { max= tmp; imax= i; }
}
return imax;
}
static void Gauss(int n, double a[n][n])
{
double (* restrict b)[n]= malloc(n*sizeof(b[0]));
double * restrict temp= malloc(n*sizeof(temp[0]));
int * restrict piv= malloc(n*sizeof(piv[0]));
double c, d;
int i, j, k, m, jmax;
memcpy(b, a, n*sizeof(b[0]));
for(i= 0; i<n; i++) piv[i]= i;
for(k= 0; k<n; k++) {
/* largest element in b[k][k:n] */
jmax= k+isamax(n-k, &b[k][k]);
if(jmax != k) {
/* swap columns k and jmax */
m= piv[k]; piv[k]= piv[jmax]; piv[jmax]= m;
for(i= 0; i<n; i++) {
c= b[i][k];
b[i][k]= b[i][jmax];
b[i][jmax]= c;
}
}
d= 1./b[k][k];
memcpy(temp, b[k], sizeof(b[k]));
for(i= 0; i<n; i++) {
c= b[i][k]*d;
for(j= 0; j<n; j++) {
b[i][j]-= temp[j]*c;
}
b[i][k]= c;
}
for(j= 0; j<n; j++) b[k][j]= -d*temp[j];
b[k][k]= d;
}
for(i= 0; i<n; i++) for(j= 0; j<n; j++) {
a[piv[i]][j]= b[i][j];
}
free(piv); free(temp); free(b);
}
****************************************
******************************
MODULE kinds
INTEGER, PARAMETER :: RK8 = SELECTED_REAL_KIND(15, 300)
END MODULE kinds
SUBROUTINE Gauss (a,n) ! Invert matrix by Gauss method
! --------------------------------------------------------------------
USE kinds
IMPLICIT NONE
INTEGER :: n
REAL(RK8) :: a(n,n)
! - - - Local Variables - - -
REAL(RK8) :: b(n,n), c, d, temp(n)
INTEGER :: i, j, k, m, imax(1), ipvt(n)
! - - - - - - - - - - - - - -
b = a
ipvt = (/ (i, i = 1, n) /)
DO k = 1,n
imax = MAXLOC(ABS(b(k:n,k)))
m = k-1+imax(1)
IF (m /= k) THEN
ipvt( (/m,k/) ) = ipvt( (/k,m/) )
b((/m,k/),:) = b((/k,m/),:)
END IF
d = 1/b(k,k)
temp = b(:,k)
DO j = 1, n
c = b(k,j)*d
b(:,j) = b(:,j)-temp*c
b(k,j) = c
END DO
b(:,k) = temp*(-d)
b(k,k) = d
END DO
a(:,ipvt) = b
END SUBROUTINE Gauss
****************************************
******************************
--
pa at panix dot com
|
|
|
|
|