Young risk taker.: [Cell] SPE上での実行時間計測

SPE上で実行時間を計測するには、SPU Decrementerを利用する。
SPU Decrementerは参考資料のPDFに次のように説明されている。

A register that counts down each time an event occurs. Each SPU
contains dedicated 32-bit decrementers for scheduling or performance
monitoring, by the program or by the SPU itself.

プロファイリングのために利用できる32bitレジスタで、一定周期で値が減少していく。計測区間の開始時と終了時の値の差を取ることで、実行時間を計測できる。

PPUのTime Base Registerと同じように周期を知るには、/proc/cpuinfoを見る。


% cat /proc/cpuinfo | grep -w timebase
timebase: 25000000
%

SPU Decrementerもこの値が周期の基準になるのか少し疑問だが、FixstarsのWikiでもこの値を使っているようなので、ここでも同じ値を使うことにする。

SPUチャネルのニーモニックとしてSPU_WrDecを指定して書き込むとレジスタに値を設定でき、SPU_RdDecを指定して読み込むとレジスタの値を取得できる。


uint32_t t;
spu_writech(SPU_WrDec, 0xffffffff); // write 0xffffffff to the SPU_WrDec channel
t = spu_readch(SPU_RdDec); // read the SPU Decrementer value from the SPU_RdDec channel

この低レベルAPIをラップした関数として、spu_write_decrementer(uint32_t), spu_read_decrementer(void)という関数がspu_mfcio.hで提供されている。


uint32_t t;
spu_write_decrementer(0xffffffff); // load the decrementer with 0xffffffff
t = spu_read_decrementer(); // read the current decrementer value into t

この関数を利用したプロファイリング用のマクロを定義する。


// Decrementer ticks per second, as reported by
// `cat /proc/cpuinfo | grep -w timebase`.
static const int TIMEBASE = 25000000;

// Profiling macros built on the SPU decrementer
// (they need <spu_mfcio.h> and <stdio.h>).
//
//   StartTimer(ts)  loads the decrementer with 0xffffffff and stores the
//                   value read back from it in ts.
//   StopTimer(te)   subtracts the current decrementer value from te, turning
//                   the value stored by StartTimer into an elapsed tick count.
//   PrintTimer(te)  prints an elapsed tick count converted to milliseconds
//                   (ticks / TIMEBASE * 1000).
//
// Each macro expands to exactly one statement and must be followed by a
// semicolon, so it is safe inside an unbraced if/else.
#define StartTimer(ts) do { spu_write_decrementer(0xffffffff); (ts) = spu_read_decrementer(); } while (0)
#define StopTimer(te) do { (te) -= spu_read_decrementer(); } while (0)
// Convert through double: a float cannot represent every 32-bit tick count.
#define PrintTimer(te) do { printf("timer: %f(msec)\n", (double)(te) / TIMEBASE * 1.0e3); } while (0)

初期値としてSPU Decrementerに大きな値を設定しておく。StopTimerでStartTimer時に取得した値との差を計算する。PrintTimerでレジスタの減少値から実行時間を算出する。
これを利用してスカラー演算とベクタ演算の実行速度を比較する。

ソースコード


#include <stdio.h>
#include <stdint.h>
#include <spu_intrinsics.h>
#include <unistd.h>
#include <spu_mfcio.h> // spu_read_decrementer and spu_write_decrementer                        
                                                                                                                              
static const int N = 200000;
static const int TIMEBASE = 2.5 * 1.0e7; // cat /proc/cpuinfo | grep -w timebase

// プロファイリング用マクロ
#define StartTimer(ts) {spu_write_decrementer(0xffffffff); \
                        ts=spu_read_decrementer();}
#define StopTimer(te) {te -= spu_read_decrementer();}
#define PrintTimer(te) {printf("timer: %f(msec)\n", te / (float)TIMEBASE * 1.0e3);}

int main(unsigned long long spu_id, unsigned long long arg) {                                                                                                        
  uint32_t profile, profile_simd, i;                                                                                                                                   
uint32_t in[4] __attribute__((aligned(16))) = {1,2,3,4};                                                                                                             
uint32_t out[4] __attribute__((aligned(16))) = {0};                                                                                                                
vec_int4 *v_in = (vec_int4 *) in;
vec_int4 *v_out = (vec_int4 *) out;

  // スカラー値で計算
StartTimer(profile);
  for(i=0;i<N;i++) {
  out[0] += in[0];
  out[1] += in[1];
  out[2] += in[2];
  out[3] += in[3];
}
StopTimer(profile);

  // SIMDでベクタ演算
StartTimer(profile_simd)
  for(i=0;i<N;i++) {
  spu_add(*v_out, *v_in);
}
StopTimer(profile_simd);

  // 出力
printf("Scalar: ");
PrintTimer(profile);

printf("Vector: ");
PrintTimer(profile_simd);

  return 0;
}

コンパイルとシミュレータへの転送用に/tmpへコピー


% make                                                                                                 
spu-gcc -Wall -I/opt/IBM/cell-sdk-1.1/src/include/spu/ -I/opt/IBM/cell-sdk-1.1/src/include/ -I/opt/IBM/cell-sdk-1.1/sysroot/usr/lib/gcc/spu/4.0.2/ 0.c -c
spu-gcc -L/opt/IBM/cell-sdk-1.1/sysroot/usr/lib/ 0.o  -o spu.out
% make install
cp spu.out ../ppu-main/ppu.out /tmp/
%

SPUプログラムを動かすためのPPUプログラムを作る。

#include <stdio.h>
#include <libspe.h>

// Path of the SPE executable image, relative to the working directory.
static const char SPE_FILE_NAME[] = "./spu.out";

// PPE-side launcher: opens the SPE program image, runs it in one SPE thread,
// waits for that thread to finish and releases the image.
// Returns 0 on success and 1 if any step fails.
int main(void) {
  int status;
  int rc = 0;
  spe_program_handle_t *spe_handle;
  speid_t spe_id;

  printf("[PPE] Open SPE program.\n");
  spe_handle = spe_open_image(SPE_FILE_NAME);
  if (spe_handle == NULL) {
    fprintf(stderr, "ERROR: Cannot open SPE program.\n");
    return 1;
  }

  printf("[PPE] Create SPE thread.\n");
  spe_id = spe_create_thread(0, spe_handle, NULL, NULL, -1, 0);
  if (spe_id == 0) {
    fprintf(stderr, "ERROR: Cannot create SPE thread.\n");
    rc = 1;
  } else {
    printf("[PPE] Waiting SPE thread...");
    // The message has no newline, so flush it before blocking; otherwise it
    // can appear after the output of the SPE program.
    fflush(stdout);
    if (spe_wait(spe_id, &status, 0) != 0) {
      printf("failed.\n");
      fprintf(stderr, "ERROR: Cannot wait for SPE thread.\n");
      rc = 1;
    } else {
      printf("done.\n");
    }
  }

  // The image is released on the failure paths above as well.
  printf("[PPE] Release SPE program.\n");
  spe_close_image(spe_handle);

  return rc;
}

コンパイルと転送

% make                                                                                                   
ppu-gcc -m32 -Wall -I/opt/IBM/cell-sdk-1.1/sysroot/usr/include 1.c -c
ppu-gcc -L/opt/IBM/cell-sdk-1.1/sysroot/usr/lib 1.o -lspe -o ppu.out -m32
% make install
cp ../spu-main/spu.out ppu.out /tmp/
%

シミュレータ上に転送し実行

# callthru source /tmp/ppu.out > ppu.out
# callthru source /tmp/spu.out > spu.out
# chmod +x ppu.out spu.out
# ./ppu.out
[PPE] Open SPE program.
[PPE] Create SPE thread.
Scalar: timer: 2.812480(msec)
Vector: timer: 0.687480(msec)
[PPE] Waiting SPE thread...done.
[PPE] Release SPE program.
#

追記
SPUの実行形式オブジェクトファイル単体でも実行可能。

SIMDを利用したベクタ演算の方が4倍ほど速くなっています。
SIMDでは同時に4データを扱えるので妥当なところかな。

参考
Cell Broadband Engine Programming Tutorial Version 1.1 (PDF)
lesson12 - Pukiwiki

Young risk taker.

Tuesday, November 28, 2006

[Cell] SPE上での実行時間計測

About Me

I love

Blog Archive

なかのひと