是否可以估算出英特尔 Nehalem 架构可实现的最大每周期指令数(IPC)?另外,限制最大每周期指令数的瓶颈是什么?
我对任何评论持开放态度。
谢谢
答案 0 :(得分:2)
TL;DR:每个时钟周期最多 5 条指令,其中一条是 test 或 cmp,并与随后的 jcc 宏融合(macro-fuse)。
cmp / jcc
来源:Agner Fog's microarch pdf and instruction tables。另请参阅x86标记wiki。
Intel Core2 及更高版本中的乱序流水线每个时钟周期可以发出/重命名 4 个融合域 uop,这就是瓶颈。宏融合将
;; Should run at one iteration per clock.
; Demo loop from the surrounding answer: five instructions per iteration, ending
; in the cmp/jb pair that the text describes as a macro-fusion candidate.
.l:
mov edx, [rsi] ; doesn't need an ALU uop. nop would work here, too
add eax, edx ; accumulate the loaded dword into eax
inc rsi ; advance the pointer by one byte
cmp rsi, rdi ; compare the pointer against rdi (presumably the end address)
jb .l ; keep looping while rsi is below rdi (unsigned compare)
组合成一个 uop,但每个解码块中只能发生一次(Haswell 之前)。
即使您能把代码安排成循环中每 4 个 uop 里有不止一个宏融合对,Nehalem 每个时钟周期也只能(在端口 5 上)执行一个融合的 test-and-branch uop。因此,即使其中一些分支未被采用(not taken),它也无法维持每个时钟周期超过一个宏融合的比较并分支。(Haswell 可以在端口 0 或端口 6 上执行未被采用的分支。)
mov
为了便于测试并消除缓存/内存瓶颈,您可以把它改成每次都从同一位置加载,而不是在寻址模式中使用循环计数器。
请注意,Haswell 之前的微架构只有三个 ALU 端口。 push
加载或存储也会占用流水线带宽,因此 4 宽(4-wide)的发射/重命名是有好处的。前端的发射速度能够快于乱序核心的执行速度也很有用,这样调度器中总有一批排队等待的工作,使它能够发现指令级并行性,并尽早启动后面的加载,等等。
我认为除了加载/存储(包括pop
/ nop
这要归功于堆栈引擎),fxchg
和mov
是 Nehalem 上仅有的不需要 ALU 端口的融合域 uop。关于 SnB 系列微架构,xor same,same
也会在重命名/发射(rename/issue)阶段处理,有时还包括寄存器到寄存器的
<script id="tunnelVertexShader" type="x-shader/x-vertex">
// Untransformed vertex position, handed to the fragment stage for interpolation.
varying vec3 vPosition;

// Vertex entry point: forwards the raw position and writes the clip-space position.
// NOTE(review): `position`, `projectionMatrix` and `modelViewMatrix` are not declared
// in this shader; presumably the host framework injects them - confirm.
void main(void) {
  vec4 localPosition = vec4(position, 1.0);
  // Keep the left-to-right matrix product so the result matches bit-for-bit.
  gl_Position = projectionMatrix * modelViewMatrix * localPosition;
  vPosition = position;
}
</script>
<script id="tunnelFragmentShader" type="x-shader/x-fragment">
// Interpolated position received from the vertex stage; main() samples noise at it.
varying vec3 vPosition;
// Base RGB tint; main() multiplies each channel by the noise value.
uniform vec3 color;
// Per-axis divisor (applied negated) for the sample position before the noise lookup.
uniform vec3 noiseScale;
// Multiplied by `time` and subtracted from the sample position's y coordinate.
uniform float speed;
// Clock value: drives the y offset in main() and is the 4th noise coordinate in turbulence().
uniform float time;
// Final multiplier applied to the averaged turbulence value.
uniform float intensity;
//
// Description : Array and textureless GLSL 2D/3D/4D simplex
// noise functions.
// Author : Ian McEwan, Ashima Arts.
// Maintainer : ijm
// Lastmod : 20110822 (ijm)
// License : Copyright (C) 2011 Ashima Arts. All rights reserved.
// Distributed under the MIT License. See LICENSE file.
// https://github.com/ashima/webgl-noise
//
// Component-wise x modulo 289, computed as x - floor(x * (1/289)) * 289.
vec4 mod289(vec4 x) {
  vec4 wholeTurns = floor(x * (1.0 / 289.0));
  return x - wholeTurns * 289.0;
}
// Scalar x modulo 289, computed as x - floor(x * (1/289)) * 289.
float mod289(float x) {
  float wholeTurns = floor(x * (1.0 / 289.0));
  return x - wholeTurns * 289.0;
}
// Component-wise hash step: evaluates (34 * x + 1) * x and reduces it with mod289.
vec4 permute(vec4 x) {
  vec4 polynomial = ((x * 34.0) + 1.0) * x;
  return mod289(polynomial);
}
// Scalar hash step: evaluates (34 * x + 1) * x and reduces it with mod289.
float permute(float x) {
  float polynomial = ((x * 34.0) + 1.0) * x;
  return mod289(polynomial);
}
// Component-wise linear function 1.79284291400159 - 0.85373472095314 * r.
// Going by the name, this stands in for 1/sqrt(r); snoise uses it to normalise gradients.
vec4 taylorInvSqrt(vec4 r)
{
  vec4 slopeTerm = 0.85373472095314 * r;
  return 1.79284291400159 - slopeTerm;
}
// Scalar linear function 1.79284291400159 - 0.85373472095314 * r.
// Going by the name, this stands in for 1/sqrt(r); snoise uses it to normalise gradients.
float taylorInvSqrt(float r)
{
  float slopeTerm = 0.85373472095314 * r;
  return 1.79284291400159 - slopeTerm;
}
// Turns the hash value j into a 4D gradient vector.
//   j  - hash value selecting the gradient
//   ip - scale factors; only ip.xyz (fract step) and ip.z (rescale) are read
// Returns the unnormalised gradient.
vec4 grad4(float j, vec4 ip)
{
  const vec4 signs = vec4(1.0, 1.0, 1.0, -1.0);
  vec4 grad;

  // Spread j over three axes: seven steps per axis, rescaled by ip.z and shifted by -1.
  grad.xyz = floor(fract(vec3(j) * ip.xyz) * 7.0) * ip.z - 1.0;
  // Fourth component is 1.5 minus the sum of the absolute xyz components.
  grad.w = 1.5 - dot(abs(grad.xyz), signs.xyz);

  // 1.0 in every lane whose component is negative, 0.0 elsewhere.
  vec4 negMask = vec4(lessThan(grad, vec4(0.0)));
  // Only when w is negative: nudge xyz by +1 (negative lanes) or -1 (the rest).
  grad.xyz = grad.xyz + (negMask.xyz * 2.0 - 1.0) * negMask.www;

  return grad;
}
// (sqrt(5) - 1)/4 = F4, used once below
#define F4 0.309016994374947451
// 4D simplex noise evaluated at v.
// Locates the lattice cell containing v, derives the five simplex corners,
// hashes each corner into a gradient via permute()/grad4(), and sums the
// corner contributions weighted by a radial falloff. The sum is scaled by 49.0.
// NOTE(review): the output range is not established by this code; presumably
// roughly [-1, 1] - confirm against the upstream documentation.
float snoise(vec4 v)
{
// C holds multiples of G4 used to offset each corner (see per-lane notes).
const vec4 C = vec4( 0.138196601125011, // (5 - sqrt(5))/20 G4
0.276393202250021, // 2 * G4
0.414589803375032, // 3 * G4
-0.447213595499958); // -1 + 4 * G4
// First corner
// i: lattice cell, found by adding F4 * (sum of v's components) and flooring.
// x0: offset of v from that corner, after adding G4 * (sum of i's components).
vec4 i = floor(v + dot(v, vec4(F4)) );
vec4 x0 = v - i + dot(i, C.xxxx);
// Other corners
// Rank sorting originally contributed by Bill Licea-Kane, AMD (formerly ATI)
// Each step() yields 1.0 where the second argument is >= the first, so the
// sums below count, per component of x0, how many other components it beats.
vec4 i0;
vec3 isX = step( x0.yzw, x0.xxx );
vec3 isYZ = step( x0.zww, x0.yyz );
// i0.x = dot( isX, vec3( 1.0 ) );
i0.x = isX.x + isX.y + isX.z;
i0.yzw = 1.0 - isX;
// i0.y += dot( isYZ.xy, vec2( 1.0 ) );
i0.y += isYZ.x + isYZ.y;
i0.zw += 1.0 - isYZ.xy;
i0.z += isYZ.z;
i0.w += 1.0 - isYZ.z;
// i0 now contains the unique values 0,1,2,3 in each channel
// Turn ranks into 0/1 corner offsets: i3 is 1.0 where rank >= 1,
// i2 where rank >= 2, i1 only where rank == 3.
vec4 i3 = clamp( i0, 0.0, 1.0 );
vec4 i2 = clamp( i0-1.0, 0.0, 1.0 );
vec4 i1 = clamp( i0-2.0, 0.0, 1.0 );
// x0 = x0 - 0.0 + 0.0 * C.xxxx
// x1 = x0 - i1 + 1.0 * C.xxxx
// x2 = x0 - i2 + 2.0 * C.xxxx
// x3 = x0 - i3 + 3.0 * C.xxxx
// x4 = x0 - 1.0 + 4.0 * C.xxxx
vec4 x1 = x0 - i1 + C.xxxx;
vec4 x2 = x0 - i2 + C.yyyy;
vec4 x3 = x0 - i3 + C.zzzz;
vec4 x4 = x0 + C.wwww;
// Permutations
// j0 hashes the first corner; the four lanes of j1 hash the remaining
// corners (cell index plus the i1, i2, i3 and all-ones offsets).
i = mod289(i);
float j0 = permute( permute( permute( permute(i.w) + i.z) + i.y) + i.x);
vec4 j1 = permute( permute( permute( permute (
i.w + vec4(i1.w, i2.w, i3.w, 1.0 ))
+ i.z + vec4(i1.z, i2.z, i3.z, 1.0 ))
+ i.y + vec4(i1.y, i2.y, i3.y, 1.0 ))
+ i.x + vec4(i1.x, i2.x, i3.x, 1.0 ));
// Gradients: 7x7x6 points over a cube, mapped onto a 4-cross polytope
// 7*7*6 = 294, which is close to the ring size 17*17 = 289.
vec4 ip = vec4(1.0/294.0, 1.0/49.0, 1.0/7.0, 0.0) ;
vec4 p0 = grad4(j0, ip);
vec4 p1 = grad4(j1.x, ip);
vec4 p2 = grad4(j1.y, ip);
vec4 p3 = grad4(j1.z, ip);
vec4 p4 = grad4(j1.w, ip);
// Normalise gradients
// Four gradients are scaled through one vec4 call; the fifth uses the scalar overload.
vec4 norm = taylorInvSqrt(vec4(dot(p0,p0), dot(p1,p1), dot(p2, p2), dot(p3,p3)));
p0 *= norm.x;
p1 *= norm.y;
p2 *= norm.z;
p3 *= norm.w;
p4 *= taylorInvSqrt(dot(p4,p4));
// Mix contributions from the five corners
// Falloff per corner is max(0.6 - |x|^2, 0); it is squared here and squared
// again in the return expression, i.e. raised to the fourth power overall.
vec3 m0 = max(0.6 - vec3(dot(x0,x0), dot(x1,x1), dot(x2,x2)), 0.0);
vec2 m1 = max(0.6 - vec2(dot(x3,x3), dot(x4,x4) ), 0.0);
m0 = m0 * m0;
m1 = m1 * m1;
return 49.0 * ( dot(m0*m0, vec3( dot( p0, x0 ), dot( p1, x1 ), dot( p2, x2 )))
+ dot(m1*m1, vec2( dot( p3, x3 ), dot( p4, x4 ) ) ) ) ;
}
// Sums |snoise| over five octaves (frequencies 2^1 .. 2^5) sampled at p, with the
// `time` uniform as the fourth noise coordinate. The running sum starts at -0.5,
// is divided by 5.0, then multiplied by the `intensity` uniform.
float turbulence( vec3 p ) {
  float sum = -0.5;
  for (int octave = 1; octave <= 5; octave++) {
    float frequency = pow(2.0, float(octave));
    vec4 samplePoint = vec4(frequency * p, time);
    sum += abs(snoise(samplePoint));
  }
  return sum / 5.0 * intensity;
}
// Fragment entry point: scrolls the sample position along y over time, looks up
// turbulence at the (negatively) scaled position, and tints `color` by the result.
// Alpha is the sum of the three tinted channels.
void main() {
  vec3 nPos = vPosition;
  nPos.y -= speed * time;

  // `0.0 - noiseScale` is kept as written so zero components behave as before.
  float n = turbulence(nPos / (0.0 - noiseScale));

  vec3 tinted = color * n;
  float alpha = tinted.x + tinted.y + tinted.z;
  gl_FragColor = vec4(tinted, alpha);
}
</script>
s(IvB及更高版本)。