|
Description
|
FULL PRODUCT VERSION :
java version "1.4.0"
Java(TM) 2 Runtime Environment, Standard Edition (build 1.4.0-b92)
Java HotSpot(TM) Server VM (build 1.4.0-b92, mixed mode)
also:
java version "1.4.1"
Java(TM) 2 Runtime Environment, Standard Edition (build 1.4.1-b21)
Java HotSpot(TM) Server VM (build 1.4.1-b21, mixed mode)
FULL OPERATING SYSTEM VERSION : Microsoft Windows 2000 [Version 5.00.2195]
ADDITIONAL OPERATING SYSTEMS : Solaris (just slow not erratic)
DESCRIPTION OF THE PROBLEM :
Windows
=======
The server VM on Windows gives erratic and slow behaviour. The program given below runs a benchmark 3 times and reports the MFLOPS for each run. For the client VM on my machine I consistently get 36 MFLOPS for each run. For the
server VM the results are inconsistent between each execution and inconsistent between each of the 3 runs in an execution, e.g.
Execution 1: 14, 24, and 5 MFLOPS
Execution 2: 26, 4, and 5 MFLOPS
The problem is also reproducible on Windows 2000 using JDK1.4.1.
The server VM is always much slower than the client VM, as shown in the results above.
Solaris
=======
No erratic behaviour for the server VM, just slow. Typically a factor of 10 slower than the client VM.
Note
====
Not related to use of double numbers, same if float used.
STEPS TO FOLLOW TO REPRODUCE THE PROBLEM :
Run the code below twice with both the client and server VMs, total of 4 runs, and compare the outputs.
REPRODUCIBILITY :
This bug can be reproduced always.
---------- BEGIN SOURCE ----------
package benchmarks.shallow;
public final class Shallow {
public static void main(final String[] args) {
/*
Benchmark weather prediction program for comparing the
performance of current supercomputers. The model is
based on the paper - The dynamics of finite-difference
models of the shallow water equations, by Robert Sadourny
J. Atmos. Sci. Vol 32, No 4, April 1975
Code by Paul N. Swartzrauber, NCAR, Oct 1984
This version of the code taken from Hoffman et al., 1988
Aspects of Using Multiprocessors for Meteorological Modelling
in "Multiprocessing in Meteorological Models"
*/
int n=256, m=n;
int np1=n+1, mp1=m+1;
double [][] u = new double[np1][mp1];
double [][] v = new double[np1][mp1];
double [][] p = new double[np1][mp1];
double [][] unew = new double[np1][mp1];
double [][] vnew = new double[np1][mp1];
double [][] pnew = new double[np1][mp1];
double [][] uold = new double[np1][mp1];
double [][] vold = new double[np1][mp1];
double [][] pold = new double[np1][mp1];
double [][] cu = new double[np1][mp1];
double [][] cv = new double[np1][mp1];
double [][] z = new double[np1][mp1];
double [][] h = new double[np1][mp1];
double [][] psi = new double[np1][mp1];
double t1, t2;
double dt = 90.;
double dx = 1.e5;
double dy = 1.e5;
double a = 1.e6;
double alpha = 0.001;
int itmax = 120;
double el = n*dx;
double pi = 4.*Math.atan(1.);
double tpi = pi+pi;
double di = tpi/m;
double dj = tpi/n;
double pcf = pi*pi*a*a/(el*el);
double fsdx = 4./dx;
double fsdy = 4./dy;
int testsMax = 3;
int i, j, it, tests; // Loop variables
for ( tests=0; tests<testsMax; tests++ ) { // No of tests to run
// two delta t (tdt) is set to dt on the first cycle
// after which it is reset to dt+dt
double tdt = dt;
// Initial values of the stream function and p
for ( j=0; j<np1; j++ ) {
for ( i=0; i<mp1; i++ ) {
psi[j][i] = a*Math.sin((i+0.5)*di)*Math.sin((j+0.5)*dj);
p[j][i] = pcf*(Math.cos(2.*i*di) + Math.cos(2.*j*dj)) +
50000.;
}
}
// Initialise velocities
for ( j=0; j<n; j++ ) {
for ( i=0; i<m; i++ ) {
u[j][i+1] = -( psi[j+1][i+1] - psi[j][i+1] ) / dy;
v[j+1][i] = ( psi[j+1][i+1] - psi[j+1][i] ) / dx;
}
}
// Periodic continuation
for ( j=0; j<n; j++ ) {
u[j][0] = u[j][m];
v[j+1][m] = v[j+1][0];
}
for ( i=0; i<m; i++ ) {
u[n][i+1] = u[0][i+1];
v[0][i] = v[n][i];
}
u[n][0] = u[0][m];
v[0][m] = v[n][0];
for ( j=0; j<np1; j++ ) {
for ( i=0; i<mp1; i++ ) {
uold[j][i] = u[j][i];
vold[j][i] = v[j][i];
pold[j][i] = p[j][i];
}
}
t1 = 1e-3*System.currentTimeMillis();
for ( it=0; it<itmax; it++ ) {
// Compute U, V and Z
for ( j=0; j<n; j++ ) {
for ( i=0; i<m; i++ ) {
cu[j][i+1] = 0.5*(p[j][i+1]+p[j][i])*u[j][i+1];
cv[j+1][i] = 0.5*(p[j+1][i]+p[j][i])*v[j+1][i];
z[j+1][i+1] = ( fsdx*(v[j+1][i+1]-v[j+1][i]) -
fsdy*(u[j+1][i+1]-u[j][i+1])) /
(p[j][i]+p[j][i+1]+p[j+1][i+1]+p[j+1][i]);
h[j][i] = p[j][i] + 0.25 *(u[j][i+1]*u[j][i+1]+u[j][i]*u
[j][i]
+ v[j+1][i]*v[j+1][i]+v[j][i]*v[j][i]);
}
}
// Periodic continuation
for ( j=0; j<n; j++ ) {
cu[j][0] = cu[j][m];
cv[j+1][m] = cv[j+1][0];
z[j+1][0] = z[j+1][m];
h[j][m] = h[j][0];
}
for ( i=0; i<m; i++ ) {
cu[n][i+1] = cu[0][i+1];
cv[0][i] = cv[n][i];
z[0][i+1] = z[n][i+1];
h[n][i] = h[0][i];
}
cu[n][0] = cu[0][m];
cv[0][m] = cv[n][0];
z[0][0] = z[n][m];
h[n][m] = h[0][0];
// Compute new values u, v and p
double tdts8 = tdt/8.;
double tdtsdx = tdt/dx;
double tdtsdy = tdt/dy;
for ( j=0; j<n; j++ ) {
for ( i=0; i<m; i++ ) {
unew[j][i+1] = uold[j][i+1] +
tdts8*(z[j+1][i+1]+z[j][i+1]) *
(cv[j+1][i+1]+cv[j+1][i]+cv[j][i]+cv[j][i+1]) -
tdtsdx*(h[j][i+1]-h[j][i]);
vnew[j+1][i] = vold[j+1][i] - tdts8*(z[j+1][i+1]+z[j+1]
[i])
*(cu[j+1][i+1]+cu[j+1][i]+cu[j][i]+cu[j][i+1])
-tdtsdy*(h[j+1][i]-h[j][i]);
pnew[j][i] = pold[j][i] - tdtsdx*(cu[j][i+1]-cu[j][i]) -
tdtsdy*(cv[j+1][i]-cv[j][i]);
}
}
// Periodic continuation
for ( j=0; j<n; j++ ) {
unew[j][0] = unew[j][m];
vnew[j+1][m] = vnew[j+1][0];
pnew[j][m] = pnew[j][0];
}
for ( i=0; i<m; i++ ) {
unew[n][i+1] = unew[0][i+1];
vnew[0][i] = vnew[n][i];
pnew[n][i] = pnew[0][i];
}
unew[n][0] = unew[0][m];
vnew[0][m] = vnew[n][0];
pnew[n][m] = pnew[0][0];
if ( it>0 ) {
for ( j=0; j<n; j++ ) {
for ( i=0; i<m; i++ ) {
uold[j][i] = u[j][i]+alpha*(unew[j][i]-2.*u[j][i]
+uold[j][i]);
vold[j][i] = v[j][i]+alpha*(vnew[j][i]-2.*v[j][i]
+vold[j][i]);
pold[j][i] = p[j][i]+alpha*(pnew[j][i]-2.*p[j][i]
+pold[j][i]);
u[j][i] = unew[j][i];
v[j][i] = vnew[j][i];
p[j][i] = pnew[j][i];
}
}
// Periodic continuation
for ( j=0; j<n; j++ ) {
uold[j][m] = uold[j][0];
vold[j][m] = vold[j][0];
pold[j][m] = pold[j][0];
u[j][m] = u[j][0];
v[j][m] = v[j][0];
p[j][m] = p[j][0];
}
for ( i=0; i<m; i++ ) {
uold[n][i] = uold[0][i];
vold[n][i] = vold[0][i];
pold[n][i] = pold[0][i];
u[n][i] = u[0][i];
v[n][i] = v[0][i];
p[n][i] = p[0][i];
}
uold[n][m] = uold[0][0];
vold[n][m] = vold[0][0];
pold[n][m] = pold[0][0];
u[n][m] = u[0][0];
v[n][m] = v[0][0];
p[n][m] = p[0][0];
}
else {
tdt = tdt+tdt;
for ( j=0; j<np1; j++ ) {
for ( i=0; i<mp1; i++ ) {
uold[j][i] = u[j][i];
vold[j][i] = v[j][i];
pold[j][i] = p[j][i];
u[j][i] = unew[j][i];
v[j][i] = vnew[j][i];
p[j][i] = pnew[j][i];
}
}
}
}
t2 = 1e-3*System.currentTimeMillis();
double psum = 0.;
for ( j=0; j<n; j++ )
for ( i=0; i<m; i++ )
psum += p[j][i];
System.out.println("PSUM: " + psum);
// 65 FP operations per point per iteration
System.out.println(" MFLOPS: " + 1e-6*65*m*n*itmax/(t2-t1));
}
}
}
---------- END SOURCE ----------
(Review ID: 159063)
======================================================================
|
|
Comments
|
Submitted On 16-OCT-2002
hlovatt
If the code given is broken into a set of smaller methods
instead of one large main method the problem goes away. Is
there an overflow in the server VM?
Submitted On 25-OCT-2002
wmbondy
I am seeing the same problem. I have a web based server
application. With the 1.4.0.02 jvm in client mode I get very
consistant cpu utilization results. So consistent that I rarely
see significant changes in performance tweaking jvm tuning
parameters.
Using the server jvm, my results 80% of the time are at worse
(more cpu usage) by at least 40+%. On the rare occasions
the server jvm runs smoothly, it out performs the client jvm
by 30% (30% less cpu usage per app transaction).
One thing is for sure, the server jvm is very erratic and flaky
for cpu intensive applications on 1.4.0 and 1.4.1 jvms. The
server jvm appears to have the ability to be a big
performance gain if it would only run consistently.
Submitted On 03-NOV-2002
hlovatt
I am dissapointed that this is closed, this causes some
problems for us. We are translating Fortran code (like the
example code above) which typically has a few large methods
and therefore we see this problem. What is the purpose of
the "too long" catch in the compiler? Is it possible to re-open
this bug so that this issue can be solved.
Submitted On 03-JAN-2003
spf1
The bug was closed as "fixed". That means that they agreed
it was a bug and they fixed it in an internal version of the
JVM. You'll probably see the fix in the next released
version (1.4.2?).
Submitted On 01-JUN-2003
ibtuser
I have not commented to this group before and am a relative
newbie to Java - here goes:
I ran this code using eclipse 2.1 on Windows XP using a single
CPU Dell laptop and the 1.4.2 beta jre. This time I got better
results, but the are still erratic compared with the -client
option.
-client:
PSUM: 3.2768000000000205E9
MFLOPS: 68.98526346864124
PSUM: 3.2768000000000205E9
MFLOPS: 68.70709624577263
PSUM: 3.2768000000000205E9
MFLOPS: 68.61487204405206
-server
PSUM: 3.276800000000021E9
MFLOPS: 21.985325430883826
PSUM: 3.2768000000000205E9
MFLOPS: 46.3908519396008
PSUM: 3.276800000000021E9
MFLOPS: 44.44663970276141
and again -server:
PSUM: 3.276800000000021E9
MFLOPS: 22.02321316428017
PSUM: 3.2768000000000205E9
MFLOPS: 48.64682139430102
PSUM: 3.276800000000021E9
MFLOPS: 45.80883575668473
different results, same pattern - the MFLOPS result gets
better on consecutive benchmarks in the code but the
pattern remains the same on consecutive runs of the code.
I am not sure if I would consider this fixed, but I could be
overlooking something here.
Submitted On 01-JUN-2003
ibtuser
I ran this test again on a Compaq DL380 using Windows 2000
advanced server and the 1.4.2 beta code.
This time I modified the lood to run 10 times - See what
happens:
-server
Iteration:0
PSUM: 3.276800000000021E9
MFLOPS: 25.55904
Iteration:1
PSUM: 3.2768000000000205E9
MFLOPS: 64.1542164297958
Iteration:2
PSUM: 3.276800000000021E9
MFLOPS: 68.15744
Iteration:3
PSUM: 3.276800000000021E9
MFLOPS: 110.90926526279557
Iteration:4
PSUM: 3.276800000000021E9
MFLOPS: 112.02735131707847
Iteration:5
PSUM: 3.276800000000021E9
MFLOPS: 110.90926526279557
Iteration:6
PSUM: 3.276800000000021E9
MFLOPS: 112.02735131707847
Iteration:7
PSUM: 3.276800000000021E9
MFLOPS: 112.42155436977689
Iteration:8
PSUM: 3.276800000000021E9
MFLOPS: 112.02735131707847
Iteration:9
PSUM: 3.276800000000021E9
MFLOPS: 109.7896931053657
-client
Iteration:0
PSUM: 3.2768000000000205E9
MFLOPS: 95.36955198427754
Iteration:1
PSUM: 3.2768000000000205E9
MFLOPS: 97.36777142857143
Iteration:2
PSUM: 3.2768000000000205E9
MFLOPS: 96.77788815993779
Iteration:3
PSUM: 3.2768000000000205E9
MFLOPS: 97.07193475585377
Iteration:4
PSUM: 3.2768000000000205E9
MFLOPS: 97.0719325583839
Iteration:5
PSUM: 3.2768000000000205E9
MFLOPS: 96.77788597576074
Iteration:6
PSUM: 3.2768000000000205E9
MFLOPS: 94.83873865615566
Iteration:7
PSUM: 3.2768000000000205E9
MFLOPS: 96.50383143708495
Iteration:8
PSUM: 3.2768000000000205E9
MFLOPS: 97.36777142857143
Iteration:9
PSUM: 3.2768000000000205E9
MFLOPS: 97.0719325583839
It seems that the result stabilises after 3 iterations for the
server, and does not change noticable for the client vm.
PLEASE NOTE: JDK6 is formerly known as Project Mustang
|