Cython函数比纯Python函数需要更多的时间

Question

Cython函数比纯Python函数需要更多的时间

4

我正在尝试加速我的代码，但其中的这一部分给了我问题，

我尝试使用Cython，然后按照这里给出的建议，但我的纯Python函数比Cython和优化过的Cython都表现更好。

下面是Cython代码：

import numpy as np
cimport numpy as np

DTYPE = np.float
ctypedef np.float_t DTYPE_t

cimport cython
@cython.boundscheck(False)
@cython.wraparound(False) 

def compute_cython(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):

    DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6   
    IceI, IceC, IceD, IceE, IceF, IceG, IceH =  273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2,  6.4650e4, 1.6935e6

    delta = u-DustJ
    result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3);

    x= u/IceI;
    result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))

    return (DensityIceProfile*result_ice+DensityDustProfile*result_dust)/DensityProfile


def compute_cythonOptimized(np.ndarray[DTYPE_t, ndim=1] u, np.ndarray[DTYPE_t, ndim=1] PorosityProfile, np.ndarray[DTYPE_t, ndim=1] DensityIceProfile, np.ndarray[DTYPE_t, ndim=1] DensityDustProfile, np.ndarray DensityProfile):

    assert u.dtype == DTYPE
    assert PorosityProfile.dtype == DTYPE
    assert DensityIceProfile.dtype == DTYPE
    assert DensityDustProfile.dtype == DTYPE
    assert DensityProfile.dtype == DTYPE

    cdef float DustJ = 250.0
    cdef float DustF = 633.0  
    cdef float DustG = 2.513 
    cdef float DustH = -2.2e-3   
    cdef float DustI = -2.8e-6 
    cdef float IceI =  273.16
    cdef float IceC =  1.843e5 
    cdef float IceD =  1.6357e8 
    cdef float IceE =  3.5519e9 
    cdef float IceF =  1.6670e2 
    cdef float IceG =  6.4650e4
    cdef float IceH =  1.6935e6

    cdef np.ndarray[DTYPE_t, ndim=1] delta = u-DustJ
    cdef np.ndarray[DTYPE_t, ndim=1] result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3);

    cdef np.ndarray[DTYPE_t, ndim=1] x= u/IceI;
    cdef np.ndarray[DTYPE_t, ndim=1] result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))

    return (DensityIceProfile*result_ice+DensityDustProfile*result_dust)/DensityProfile

我接着运行以下命令：

def compute_python(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):

    DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6   
    IceI, IceC, IceD, IceE, IceF, IceG, IceH =  273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2,  6.4650e4, 1.6935e6

    delta = u-DustJ
    result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3);

    x= u/IceI;
    result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))

    return (DensityIceProfile*result_ice+DensityDustProfile*result_dust)/DensityProfile

import sublimation
import numpy as np

%timeit compute_python(np.random.rand(100),np.random.rand(100),np.random.rand(100),np.random.rand(100),np.random.rand(100))

%timeit compute_cython(np.random.rand(100),np.random.rand(100),np.random.rand(100),np.random.rand(100),np.random.rand(100))

%timeit compute_cythonOptimized(np.random.rand(100),np.random.rand(100),np.random.rand(100),np.random.rand(100),np.random.rand(100))

这导致了以下结果：

对于纯Python：68.9 µs±851ns每个循环（7次运行的平均值±标准偏差，每个10000个循环） 对于未优化的Cython：68.2 µs±685ns每个循环（7次运行的平均值±标准偏差，每个10000个循环） 而对于优化后的代码：72.7 µs±416ns每个循环（7次运行的平均值±标准偏差，每个10000个循环） 我做错了什么？

感谢您的帮助，

- Anthony Lethuillier

3

没什么需要优化的；numpy 已经在尽力了，而且它已经是用 C 写成的。你只是增加了一层开销。 - chepner

1

除了chepner所说的，由于您已经处于“numpy”世界中，请考虑使用“numba”。 - juanpa.arrivillaga

1

你是否正在使用函数测量随机数生成？我在Python版本中得到了46.8微秒，而在简单的Numba版本(@numba.njit(fastmath=True))中得到了3.11微秒。你的数组每次都这么小吗（只有100个值）？如果不是，那么并行化也会有益处。 - max9111

2个回答

4

使用Numba的解决方案

CodeSurgeon已经给出了一个使用Cython的优秀答案。在这个答案中，我想展示一种使用Numba的替代方法。

我创建了三个版本。在naive_numba中，我仅添加了一个函数装饰器。在improved_Numba中，我手动结合了循环（每个矢量化命令实际上是一个循环）。在improved_Numba_p中，我并行化了函数。请注意，显然存在一个错误，不允许在使用并行加速器时定义常量值。还应该注意到，并行化版本仅对较大的输入数组有益。但是您也可以添加一个小包装器，根据输入数组大小调用单线程或并行化版本。

代码数据类型为float64

import numba as nb
import numpy as np
import time



@nb.njit(fastmath=True)
def naive_Numba(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
  DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6   
  IceI, IceC, IceD, IceE, IceF, IceG, IceH =  273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2,  6.4650e4, 1.6935e6

  delta = u-DustJ
  result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3);

  x= u/IceI;
  result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))

  return (DensityIceProfile*result_ice+DensityDustProfile*result_dust)/DensityProfile

#error_model='numpy' sets divison by 0 to NaN instead of throwing a exception, this allows vectorization
@nb.njit(fastmath=True,error_model='numpy')
def improved_Numba(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
  DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6   
  IceI, IceC, IceD, IceE, IceF, IceG, IceH =  273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2,  6.4650e4, 1.6935e6
  res=np.empty(u.shape[0],dtype=u.dtype)

  for i in range(u.shape[0]):
    delta = u[i]-DustJ
    result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3);

    x= u[i]/IceI
    result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))

    res[i]=(DensityIceProfile[i]*result_ice+DensityDustProfile[i]*result_dust)/DensityProfile[i]

  return res

#there is obviously a bug in Numba (declaring const values in the function)
@nb.njit(fastmath=True,parallel=True,error_model='numpy')
def improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile,DustJ, DustF, DustG, DustH, DustI,IceI, IceC, IceD, IceE, IceF, IceG, IceH):
  res=np.empty((u.shape[0]),dtype=u.dtype)

  for i in nb.prange(u.shape[0]):
    delta = u[i]-DustJ
    result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3);

    x= u[i]/IceI
    result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))

    res[i]=(DensityIceProfile[i]*result_ice+DensityDustProfile[i]*result_dust)/DensityProfile[i]

  return res

u=np.array(np.random.rand(1000000),dtype=np.float32)
PorosityProfile=np.array(np.random.rand(1000000),dtype=np.float32)
DensityIceProfile=np.array(np.random.rand(1000000),dtype=np.float32)
DensityDustProfile=np.array(np.random.rand(1000000),dtype=np.float32)
DensityProfile=np.array(np.random.rand(1000000),dtype=np.float32)
DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6
IceI, IceC, IceD, IceE, IceF, IceG, IceH =  273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2,  6.4650e4, 1.6935e6

#don't measure compilation overhead on first call
res=improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile,DustJ, DustF, DustG, DustH, DustI,IceI, IceC, IceD, IceE, IceF, IceG, IceH) 
for i in range(1000):
  res=improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile,DustJ, DustF, DustG, DustH, DustI,IceI, IceC, IceD, IceE, IceF, IceG, IceH)

print(time.time()-t1)
print(time.time()-t1)

性能

Arraysize np.random.rand(100)
Numpy             46.8µs
naive Numba       3.1µs
improved Numba:   1.62µs
improved_Numba_p: 17.45µs


#Arraysize np.random.rand(1000000)
Numpy             255.8ms
naive Numba       18.6ms
improved Numba:   6.13ms
improved_Numba_p: 3.54ms

代码 dtype=np.float32

如果np.float32足够使用，你需要在函数中显式声明所有常量值为float32。否则Numba将使用float64。

@nb.njit(fastmath=True,error_model='numpy')
def improved_Numba(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
  DustJ, DustF, DustG, DustH, DustI = nb.float32(250.0), nb.float32(633.0), nb.float32(2.513), nb.float32(-2.2e-3), nb.float32(-2.8e-6)
  IceI, IceC, IceD, IceE, IceF, IceG, IceH =  nb.float32(273.16), nb.float32(1.843e5), nb.float32(1.6357e8), nb.float32(3.5519e9), nb.float32(1.6670e2),  nb.float32(6.4650e4), nb.float32(1.6935e6)
  res=np.empty(u.shape[0],dtype=u.dtype)

  for i in range(u.shape[0]):
    delta = u[i]-DustJ
    result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3)

    x= u[i]/IceI
    result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(nb.float32(1)+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))

    res[i]=(DensityIceProfile[i]*result_ice+DensityDustProfile[i]*result_dust)/DensityProfile[i]

  return res

@nb.njit(fastmath=True,parallel=True,error_model='numpy')
def improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
  res=np.empty((u.shape[0]),dtype=u.dtype)
  DustJ, DustF, DustG, DustH, DustI = nb.float32(250.0), nb.float32(633.0), nb.float32(2.513), nb.float32(-2.2e-3), nb.float32(-2.8e-6)
  IceI, IceC, IceD, IceE, IceF, IceG, IceH =  nb.float32(273.16), nb.float32(1.843e5), nb.float32(1.6357e8), nb.float32(3.5519e9), nb.float32(1.6670e2),  nb.float32(6.4650e4), nb.float32(1.6935e6)

  for i in nb.prange(u.shape[0]):
    delta = u[i]-DustJ
    result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3)

    x= u[i]/IceI
    result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(nb.float32(1)+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))

    res[i]=(DensityIceProfile[i]*result_ice+DensityDustProfile[i]*result_dust)/DensityProfile[i]

  return res

性能

Arraysize np.random.rand(100).astype(np.float32)
Numpy             29.3µs
improved Numba:   1.33µs
improved_Numba_p: 18µs


Arraysize np.random.rand(1000000).astype(np.float32)
Numpy             117ms
improved Numba:   2.46ms
improved_Numba_p: 1.56ms

与@CodeSurgeon提供的Cython版本的比较并不公平，因为他没有使用启用了AVX2和FMA3指令的函数进行编译。Numba默认使用-march=native进行编译，在我的Core i7-4xxx上启用了AVX2和FMA3指令。

但是，如果您想要分发已编译的Cython版本的代码，这就有意义了��因为如果启用了这些优化，它将无法在预Haswell处理器（或所有Pentium和Celerons）上默认运行。编译多个代码路径应该是可能的，但取决于编译器，并且需要更多的工作。

- max9111

这看起来很令人兴奋！我回家后会尝试使用march=native选项编译我的代码，并包含一个使用cython的prange的并行版本。我还会尝试在我的奇怪构建环境中设置numba（在Windows上运行所有代码以使用mingw提供的方便预构建软件包）。 - CodeSurgeon

谢谢你的回答，非常有帮助。我通常使用大小为100的数组，因此改进后的Numba似乎是最好的选择，但我会记住并行版本以备将来参考 :) - Anthony Lethuillier

网页内容由stack overflow 提供, 点击上面的

可以查看英文原文，
原文链接

- CodeSurgeon · Accepted Answer

我基本上同意@chepner和@juanpa.arrivillaga在评论中提出的建议。Numpy是一个高性能的库，它执行的底层计算确实是用C语言编写的。此外，它的语法很干净，对于numpy数组的所有元素应用标量操作很容易。

然而，如果我们使用以下假设（并且可以容忍丑陋的代码），实际上有一种方法可以通过cython显着提高代码的性能：

您的所有数组都是一维的，因此在每个数组中迭代每个项目非常轻松。例如，我们不需要替换像numpy.dot这样的更难的numpy函数，因为您代码中的所有操作仅将标量与矩阵结合起来。
虽然在Python中使用for循环是不可想象的，但在Cython中遍历每个索引非常可行。此外，最终输出中的每个项仅取决于与该项索引对应的输入（即第0项使用u [0]，PorosityProfile [0]等）。
您不感兴趣任何中间数组，只关心在compute_python函数中返回的最终结果。因此，为所有这些中间numpy数组分配内存浪费时间吗？
使用x**y语法非常缓慢。我使用gcc编译器选项--ffast-math显着改善了这一点。我还使用了几个cython编译器指令，以避免python检查和开销。
创建numpy数组本身可能会有Python开销，因此我使用类型化的memoryviews（首选的新语法）和malloc指针的组合来创建输出数组，而不需要与Python交互太多（仅有两行，获取输出大小和返回语句在Cython注释文件中显示了显著的Python交互）。

综合考虑所有这些因素，以下是修改后的代码。它在我的笔记本电脑上比原始的Python版本快近一个数量级。

sublimation.pyx

from libc.stdlib cimport malloc, free

def compute_cython(float[:] u, float[:] porosity_profile, 
        float[:] density_ice_profile, float[:] density_dust_profile, 
        float[:] density_profile):    
    cdef:
        float dust_j, dust_f, dust_g, dust_h, dust_i
        float ice_i, ice_c, ice_d, ice_e, ice_f, ice_g, ice_h
        int size, i
        float dt, result_dust, x, dust
        float result_ice_numer, result_ice_denom, result_ice, ice
        float* out

    dust_j, dust_f, dust_g, dust_h, dust_i = \
        250.0, 633.0, 2.513, -2.2e-3, -2.8e-6
    ice_i, ice_c, ice_d, ice_e, ice_f, ice_g, ice_h = \
        273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2, 6.4650e4, 1.6935e6
    size = len(u)
    out = <float *>malloc(size * sizeof(float))

    for i in range(size):
        dt = u[i] - dust_j
        result_dust = dust_f + (dust_g*dt) + (dust_h*dt**2) + (dust_i*dt**3)
        x = u[i] / ice_i
        result_ice_numer = x**3*(ice_c + ice_d*x**2 + ice_e*x**6)
        result_ice_denom = 1 + ice_f*x**2 + ice_g*x**4 + ice_h*x**8
        result_ice = result_ice_numer / result_ice_denom
        ice = density_ice_profile[i]*result_ice
        dust = density_dust_profile[i]*result_dust
        out[i] = (dust + ice)/density_profile[i]
    return <float[:size]>out

setup.py

from distutils.core import setup
from Cython.Build import cythonize
from distutils.core import Extension

def create_extension(ext_name):
    global language, libs, args, link_args
    path_parts = ext_name.split(".")
    path = "./{0}.pyx".format("/".join(path_parts))
    ext = Extension(ext_name, sources=[path], libraries=libs, language=language,
            extra_compile_args=args, extra_link_args=link_args)
    return ext

if __name__ == "__main__":
    libs = []#no external c libraries in this case
    language = "c"#chooses c rather than c++ since no c++ features were used
    args = ["-w", "-O3", "-ffast-math"]#assumes gcc is the compiler
    link_args = []#none here, could use -fopenmp for parallel code
    annotate = True#autogenerates .html files per .pyx
    directives = {#saves typing @cython decorators and applies them globally
        "boundscheck": False,
        "wraparound": False,
        "initializedcheck": False,
        "cdivision": True,
        "nonecheck": False,
    }

    ext_names = [
        "sublimation",
    ]

    extensions = [create_extension(ext_name) for ext_name in ext_names]
    setup(ext_modules = cythonize(
            extensions, 
            annotate=annotate, 
            compiler_directives=directives,
        )
    )

main.py

import numpy as np
import sublimation as sub

def compute_python(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
    DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6   
    IceI, IceC, IceD, IceE, IceF, IceG, IceH =  273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2,  6.4650e4, 1.6935e6
    delta = u-DustJ
    result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3)
    x = u/IceI
    result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))
    return (DensityIceProfile*result_ice+DensityDustProfile*result_dust)/DensityProfile

size = 100
u = np.random.rand(size).astype(np.float32)
porosity = np.random.rand(size).astype(np.float32)
ice = np.random.rand(size).astype(np.float32)
dust = np.random.rand(size).astype(np.float32)
density = np.random.rand(size).astype(np.float32)

"""
Run these from the terminal to out the performance!
python3 -m timeit -s "from main import compute_python, u, porosity, ice, dust, density" "compute_python(u, porosity, ice, dust, density)"
python3 -m timeit -s "from main import sub, u, porosity, ice, dust, density" "sub.compute_cython(u, porosity, ice, dust, density)"
python3 -m timeit -s "import numpy as np; from main import sub, u, porosity, ice, dust, density" "np.asarray(sub.compute_cython(u, porosity, ice, dust, density))"

The first command tests the python version. (10000 loops, best of 3: 45.5 usec per loop)
The second command tests the cython version, but returns just a memoryview object. (100000 loops, best of 3: 4.63 usec per loop)
The third command tests the cython version, but converts the result to a ndarray (slower). (100000 loops, best of 3: 6.3 usec per loop)
"""

如果我对如何解释这个答案的部分有不清楚的地方，请告诉我，希望它能帮助到您！

更新1:

很遗憾，我无法让MSYS2和依赖于LLVM的numba协同工作，因此我无法进行任何直接比较。然而，在遵循@max9111的建议后，我在我的setup.py文件的args列表中添加了-march=native；然而，计时与之前没有显着差异。

从这个很棒的答案中可以看出，在numpy数组和类型内存视图之间的自动转换中存在一些开销，这在初始函数调用（如果您将结果转换回来，返回语句也是如此）中都会发生。回到使用这样的函数签名：

ctypedef np.float32_t DTYPE_t
def compute_cython_np(
        np.ndarray[DTYPE_t, ndim=1] u, 
        np.ndarray[DTYPE_t, ndim=1] porosity_profile, 
        np.ndarray[DTYPE_t, ndim=1] density_ice_profile, 
        np.ndarray[DTYPE_t, ndim=1] density_dust_profile, 
        np.ndarray[DTYPE_t, ndim=1] density_profile):

这样一来，每次调用函数的时间就会减少约1微秒，从4.6微秒降至3.6微秒，这在函数需要被多次调用时尤为显著。当然，如果你计划多次调用该函数，那么直接传入二维numpy数组可能更有效率，可以避免大量的Python函数调用开销，并分摊numpy数组->类型化内存视图转换的成本。

此外，使用numpy结构化数组可能会很有意思，因为它们可以在Cython中转换为结构化内存视图，这样可以将所有数据放得更靠近缓存，加快内存访问速度。

最后，如之前所述，以下是一个使用prange并利用并行处理的版本。请注意，这只能与类型化内存视图一起使用，因为Python的GIL必须在prange循环中释放（并且需要使用-fopenmp标志编译args和link_args）：

from cython.parallel import prange
from libc.stdlib cimport malloc, free
def compute_cython_p(float[:] u, float[:] porosity_profile, 
        float[:] density_ice_profile, float[:] density_dust_profile, 
        float[:] density_profile):    
    cdef:
        float dust_j, dust_f, dust_g, dust_h, dust_i
        float ice_i, ice_c, ice_d, ice_e, ice_f, ice_g, ice_h
        int size, i
        float dt, result_dust, x, dust
        float result_ice_numer, result_ice_denom, result_ice, ice
        float* out

    dust_j, dust_f, dust_g, dust_h, dust_i = \
        250.0, 633.0, 2.513, -2.2e-3, -2.8e-6
    ice_i, ice_c, ice_d, ice_e, ice_f, ice_g, ice_h = \
        273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2, 6.4650e4, 1.6935e6
    size = len(u)
    out = <float *>malloc(size * sizeof(float))

    for i in prange(size, nogil=True):
        dt = u[i] - dust_j
        result_dust = dust_f + (dust_g*dt) + (dust_h*dt**2) + (dust_i*dt**3)
        x = u[i] / ice_i
        result_ice_numer = x**3*(ice_c + ice_d*x**2 + ice_e*x**6)
        result_ice_denom = 1 + ice_f*x**2 + ice_g*x**4 + ice_h*x**8
        result_ice = result_ice_numer / result_ice_denom
        ice = density_ice_profile[i]*result_ice
        dust = density_dust_profile[i]*result_dust
        out[i] = (dust + ice)/density_profile[i]
    return <float[:size]>out

更新2：

根据评论区 @max9111 提供的非常有用的额外建议，我将代码中所有的 float[:] 声明更改为 float[::1]。这样做的意义在于它允许数据被连续存储，而 cython 就不需要担心元素之间存在步长。这使得 SIMD 向量化成为可能，从而进一步优化了代码。以下是更新后的计时结果，使用以下命令生成：

python3 -m timeit -s "from main import compute_python, u, porosity, ice, dust, density" "compute_python(u, porosity, ice, dust, density)"
python3 -m timeit -s "import numpy as np; from main import sub, u, porosity, ice, dust, density" "np.asarray(sub.compute_cython(u, porosity, ice, dust, density))"
python3 -m timeit -s "import numpy as np; from main import sub, u, porosity, ice, dust, density" "np.asarray(sub.compute_cython_p(u, porosity, ice, dust, density))"

size = 100
python: 44.7 usec per loop
cython serial: 4.44 usec per loop
cython parallel: 111 usec per loop
cython serial contiguous: 3.83 usec per loop
cython parallel contiguous: 116 usec per loop

size = 1000
python: 167 usec per loop
cython serial: 16.4 usec per loop
cython parallel: 115 usec per loop
cython serial contiguous: 8.24 usec per loop
cython parallel contiguous: 111 usec per loop

size = 10000
python: 1.32 msec per loop
cython serial: 128 usec per loop
cython parallel: 142 usec per loop
cython serial contiguous: 55.5 usec per loop
cython parallel contiguous: 150 usec per loop

size = 100000
python: 19.5 msec per loop
cython serial: 1.21 msec per loop
cython parallel: 691 usec per loop
cython serial contiguous: 473 usec per loop
cython parallel contiguous: 274 usec per loop

size = 1000000
python: 211 msec per loop
cython serial: 12.3 msec per loop
cython parallel: 5.74 msec per loop
cython serial contiguous: 4.82 msec per loop
cython parallel contiguous: 1.99 msec per loop