我在一个应用程序中混合使用了 C 线程和 Haskell 线程，并没有发现在两者之间切换会带来明显的性能损失。因此，我设计了一个简单的基准测试……结果表明，其开销比 Don 测得的要小得多（更快、代价更低）。以下是在 2.66GHz i7 上运行 1000 万次迭代的测量结果：
$ ./foo
IO : 2381952795 nanoseconds total, 238.195279 nanoseconds per, 160000000 value
Pure: 2188546976 nanoseconds total, 218.854698 nanoseconds per, 160000000 value
使用 GHC 7.0.3/x86_64 和 gcc-4.2.1 在 OSX 10.6 上编译
ghc -no-hs-main -lstdc++ -O2 -optc-O2 -o foo ForeignExportCost.hs Driver.cpp
Haskell:
{-# LANGUAGE ForeignFunctionInterface #-}
module ForeignExportCost where
import Foreign.C.Types
-- | Pure entry point exported to C: squares its argument.
-- Called from the C++ driver to measure the cost of a pure foreign export.
foreign export ccall simpleFunction :: CInt -> CInt
simpleFunction x = x * x
-- | IO entry point exported to C: squares its argument and returns the
-- result in the IO monad. Called from the C++ driver to measure the cost of
-- an IO foreign export.
foreign export ccall simpleFunctionIO :: CInt -> IO CInt
simpleFunctionIO x = return $ x * x
此外还需要一个 OSX 上的 C++ 应用程序来驱动它；这段代码很容易移植到 Windows 或 Linux：
#include <stdio.h>
#include <mach/mach_time.h>
#include <mach/kern_return.h>
#include <HsFFI.h>
#include "ForeignExportCost_stub.h"
// Number of foreign calls issued per timed benchmark loop (ten million).
static const int s_loop = 10 * 1000 * 1000;
int main(int argc, char** argv) {
hs_init(&argc, &argv);
struct mach_timebase_info timebase_info = { };
kern_return_t err;
err = mach_timebase_info(&timebase_info);
if (err != KERN_SUCCESS) {
fprintf(stderr, "error: %x\n", err);
return err;
}
uint64_t start = mach_absolute_time();
HsInt32 val = 0;
for (int i = 0; i < s_loop; ++i) {
val += simpleFunctionIO(4);
}
uint64_t duration = (mach_absolute_time() - start) * timebase_info.numer / timebase_info.denom;
double duration_per = static_cast<double>(duration) / s_loop;
printf("IO : %lld nanoseconds total, %f nanoseconds per, %d value\n", duration, duration_per, val);
start = mach_absolute_time();
val = 0;
for (int i = 0; i < s_loop; ++i) {
val += simpleFunction(4);
}
duration = (mach_absolute_time() - start) * timebase_info.numer / timebase_info.denom;
duration_per = static_cast<double>(duration) / s_loop;
printf("Pure: %lld nanoseconds total, %f nanoseconds per, %d value\n", duration, duration_per, val);
hs_exit();
}