SDL软件渲染比硬件渲染更快

3
我刚刚发现SDL有一个奇怪的行为。
我编写了一个简单的粒子渲染器,但由于某些原因,使用软件渲染器比硬件渲染器运行速度快了大约6倍。
以下是源代码:
main.cpp
#define _USE_MATH_DEFINES
#include <math.h>
#include <time.h>

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <vector>

#include <Windows.h>
#include <SDL.h>

#include "Particle.h"

// Window size in pixels; also the bounds Particle::tick() wraps positions at.
const int SCREEN_WIDTH = 1024;
const int SCREEN_HEIGHT = 600;
// Number of particles simulated and drawn every frame.
const int PARTICLE_NUMBER = 50000;
// Initial speeds are drawn as rand() % (MAX_SPEED - MIN_SPEED) + MIN_SPEED,
// i.e. from [MIN_SPEED, MAX_SPEED). tick() scales speed by delta/1000, so the
// unit is position units (pixels) per second.
const int MAX_SPEED = 200;
const int MIN_SPEED = 5;

/**
 * Returns a millisecond counter built from the current local wall-clock time.
 *
 * The fields of SYSTEMTIME are combined using fixed 30-day months and
 * 12-month (360-day) years, so the result is not a real Unix timestamp; it is
 * suited to computing the difference between two nearby calls.
 * NOTE(review): because it is based on local time, the value can jump when
 * the clock is adjusted or a month boundary is crossed.
 *
 * Every term is evaluated in 64-bit arithmetic. The previous form multiplied
 * the 16-bit fields by plain int literals, so e.g. wDay * 86400000 overflowed
 * a 32-bit int for any day >= 25.
 *
 * @return milliseconds as described above.
 */
long long getMs (void) {
    SYSTEMTIME stime;
    GetLocalTime(&stime);

    const long long msPerSecond = 1000LL;
    const long long msPerMinute = 60LL * msPerSecond;   // 60000
    const long long msPerHour   = 60LL * msPerMinute;   // 3600000
    const long long msPerDay    = 24LL * msPerHour;     // 86400000
    const long long msPerMonth  = 30LL * msPerDay;      // 2592000000
    const long long msPerYear   = 12LL * msPerMonth;    // 31104000000

    long long ms = stime.wMilliseconds +
        stime.wSecond * msPerSecond +
        stime.wMinute * msPerMinute +
        stime.wHour * msPerHour +
        stime.wDay * msPerDay +
        stime.wMonth * msPerMonth +
        (stime.wYear - 1970) * msPerYear;
    return ms;
}

int main(int argc, char *argv[])
{
    bool hardwareAccelerated = true;

    if (argc == 2)
    {
        if (strncmp(argv[1], "-software", 9) == 0)
        {
            hardwareAccelerated = false;
        }
    }

    char title [100];
    sprintf(title, "Particles: %d - (%s)", PARTICLE_NUMBER, (hardwareAccelerated ? "HARDWARE ACCELERATED" : "SOFTWARE RENDERING"));

    Particle<double> *particles = (Particle<double>*) malloc(sizeof(Particle<double>) * PARTICLE_NUMBER);

    for (int i = 0; i < PARTICLE_NUMBER; i++)
    {
        double x = rand() % SCREEN_WIDTH;
        double y = rand() % SCREEN_HEIGHT;
        double direction = (((double) rand() / (double) RAND_MAX) - 0.5f) * 2 * M_PI;
        double speed = rand() % (MAX_SPEED - MIN_SPEED) + MIN_SPEED;
        (particles+i)->setPos(x, y);
        (particles+i)->setDirection(direction);
        (particles+i)->setSpeed(speed);
        // std::cout << (particles+i) << std::endl;
    }



    if (SDL_Init(SDL_INIT_EVERYTHING) != 0) {
        return 1;
    }

    SDL_Window *window = SDL_CreateWindow(title,
        SDL_WINDOWPOS_CENTERED, SDL_WINDOWPOS_CENTERED,
        SCREEN_WIDTH, SCREEN_HEIGHT, SDL_WINDOW_SHOWN);
    if (window == nullptr) {
        return 2;
    }

    SDL_RendererFlags flags = (hardwareAccelerated ? SDL_RENDERER_ACCELERATED : SDL_RENDERER_SOFTWARE);
    SDL_Renderer *renderer = SDL_CreateRenderer(window, -1,
        flags);
    if (renderer == nullptr) {
        return 3;
    }

    bool quit = false;
    SDL_Event evt;

    long long lastFrame = getMs();
    double delta = 0.f;
    while (!quit)
    {
        long long currentTime = getMs();
        delta = currentTime - lastFrame;
        lastFrame = currentTime;

        std::cout << "delta: " << delta << std::endl;

        while(SDL_PollEvent(&evt) != 0)
        {
            if (evt.type == SDL_QUIT)
            {
                quit = true;
            }
        }
        SDL_SetRenderDrawColor(renderer, 0,0,0,1);
        SDL_RenderClear(renderer);
        SDL_SetRenderDrawColor(renderer, 255,0,0,255);
        for (int i = 0; i < PARTICLE_NUMBER; i++)
        {
            (particles+i)->tick(delta);
            double *pos = (particles+i)->getPos();
            SDL_RenderDrawPoint(renderer, pos[0], pos[1]);
        }
        SDL_RenderPresent(renderer);
    }

    SDL_DestroyRenderer(renderer);
    SDL_DestroyWindow(window);
    SDL_Quit();

    return 0;
}

particle.h

#ifndef _H_PARTICLE
#define _H_PARTICLE
#include <math.h>

// Bounds that Particle::tick() wraps positions at. This header does not
// define them; the including source file is expected to (main.cpp defines
// both as `const int` after including this header, which is compatible with
// these declarations). Declaring them here makes the names visible at the
// point where the template below is defined.
extern const int SCREEN_WIDTH;
extern const int SCREEN_HEIGHT;

/**
 * A point particle with a position, a scalar speed and a heading.
 *
 * @tparam T numeric type used for the coordinates and the speed.
 */
template <class T>
class Particle
{
public:
    /** Creates a particle at the origin with zero speed and direction. */
    Particle(void);

    /** Advances the particle by `delta` milliseconds (see definition). */
    void tick(double);

    /** Sets the position to (x, y). */
    void setPos(T, T);
    /**
     * Returns a pointer to two consecutive values: [0] is x, [1] is y.
     * The pointer refers to storage inside this particle, so it stays valid
     * for as long as the particle itself does.
     */
    T* getPos(void);
    /** Sets the heading, in radians. */
    void setDirection(double);
    /** Returns the heading, in radians. */
    double getDirection(void);
    /** Sets the speed, in position units per second. */
    void setSpeed(T);
    /** Returns the speed, in position units per second. */
    T getSpeed(void);
    ~Particle(void);
private:
    T pos[2];          // pos[0] = x, pos[1] = y
    T speed;
    double direction;  // radians, passed to cos()/sin()
};

template <class T>
Particle<T>::Particle(void)
    : pos(), speed(), direction(0.0)
{
}

/**
 * Moves the particle along its heading and wraps it around the screen.
 *
 * @param delta elapsed time in milliseconds; the distance travelled is
 *              speed * delta / 1000.
 */
template <class T>
void Particle<T>::tick(double delta)
{
    double dt = delta / 1000;
    T d_speed = this->speed * dt;

    this->pos[0] += cos(this->direction) * d_speed;
    this->pos[1] += sin(this->direction) * d_speed;

    // Leaving through one edge re-enters at the opposite edge.
    if (this->pos[0] > SCREEN_WIDTH) this->pos[0] = 0;
    if (this->pos[1] > SCREEN_HEIGHT) this->pos[1] = 0;
    if (this->pos[0] < 0) this->pos[0] = SCREEN_WIDTH;
    if (this->pos[1] < 0) this->pos[1] = SCREEN_HEIGHT;
}

template <class T>
void Particle<T>::setPos(T x, T y)
{
    this->pos[0] = x;
    this->pos[1] = y;
}

template <class T>
T* Particle<T>::getPos(void)
{
    // Return the member array itself. The earlier version filled a local
    // array and returned its address, which dangled as soon as the function
    // returned.
    return this->pos;
}

template <class T>
void Particle<T>::setDirection(double direction)
{
    this->direction = direction;
}

template <class T>
double Particle<T>::getDirection(void)
{
    return this->direction;
}

template <class T>
void Particle<T>::setSpeed(T speed)
{
    this->speed = speed;
}

template <class T>
T Particle<T>::getSpeed(void)
{
    return this->speed;
}

template <class T>
Particle<T>::~Particle(void)
{
}

#endif

为什么会这样呢?硬件渲染难道不应该比软件渲染快得多吗?

1
你可以尝试使用SDL_RenderDrawPoints,看看是否有所改善。虽然我不熟悉SDL,但批处理通常比单个绘制调用更好。 - Retired Ninja
这意味着每个时刻都要转换50000个对象。我并不认为这是问题所在,难道它不会减慢软件和硬件渲染器的速度吗? - rrrrrrrrrrrrrrrr
1
SDL_RenderDrawPoint 调用 SDL_RenderDrawPoints(但计数为1)。 - patchwork
2
我添加了简单的批处理:软件渲染下每帧的 delta 从约12降到约8,硬件渲染下从约32降到约8。 - Retired Ninja
你是对的,我很抱歉。 - rrrrrrrrrrrrrrrr
1个回答

1

SDL_RenderDrawPoint() 调用 SDL_RenderDrawPoints(),但计数为 1。 SDL_RenderDrawPoints() 在渲染所需点数之前调用 SDL_stack_alloc(),并在完成后调用 SDL_stack_free()。这可能就是问题所在:你在每一帧中都要为系统里的每个粒子各执行一次 malloc/free。

我认为 Retired Ninja 的想法是正确的 - 使用 SDL_RenderDrawPoints(),这样每帧只需执行一次 malloc/free。

或者 - 使用不同的范例。创建一个 SDL_Surface。每帧,通过直接操作 SDL_Surface 的像素内存(对特定像素执行 SDL_MapRGB()),将所有需要的像素绘制出来,然后在渲染时将 SDL_Surface 转换为 SDL_Texture 并呈现给渲染器。

如果名为 Particle 的类持有一个指向 SDL_Surface 的指针,那么它的绘制函数可以写成类似下面的示例代码:
void Particle::draw()
{
  Uint32 x = m_position.getX();
  Uint32 y = m_position.getY();
  Uint32 * pixel = (Uint32*)m_screen->pixels+(y*(m_pitch/4))+x;

  Uint8 r1 = 0;
  Uint8 g1 = 0;
  Uint8 b1 = 0;
  Uint8 a1 = 0;
  GFX_RGBA_FROM_PIXEL(*pixel, m_screen->format, &r1, &g1, &b1, &a1);

  Uint32 * p = (Uint32*)m_screen->pixels+(y*(m_pitch/4))+x;
  *p = SDL_MapRGB(m_screen->format, m_r, m_g, m_b);
}

这里的GFX_RGBA_FROM_PIXEL是从Andreas Schiffler的SDL2_gfx库中借用的,并被定义为:

///////////////////////////////////////////////////////////////////
/**
 * Splits a packed pixel value into its red, green, blue and alpha components
 * using the mask, shift and loss fields of the given pixel format.
 *
 * For each channel the pixel is masked, shifted right by the channel's shift
 * and then left by the channel's loss; the result is stored truncated to
 * 8 bits.
 *
 * @param pixel packed pixel value to decode
 * @param fmt   pixel format supplying the per-channel mask/shift/loss
 * @param r     receives the red component
 * @param g     receives the green component
 * @param b     receives the blue component
 * @param a     receives the alpha component
 */
void GFX_RGBA_FROM_PIXEL(Uint32 pixel, SDL_PixelFormat * fmt, Uint8* r, Uint8* g, Uint8* b, Uint8* a)
{
  Uint32 channel;

  // Red
  channel = pixel & fmt->Rmask;
  channel >>= fmt->Rshift;
  channel <<= fmt->Rloss;
  *r = static_cast<Uint8>(channel);

  // Green
  channel = pixel & fmt->Gmask;
  channel >>= fmt->Gshift;
  channel <<= fmt->Gloss;
  *g = static_cast<Uint8>(channel);

  // Blue
  channel = pixel & fmt->Bmask;
  channel >>= fmt->Bshift;
  channel <<= fmt->Bloss;
  *b = static_cast<Uint8>(channel);

  // Alpha
  channel = pixel & fmt->Amask;
  channel >>= fmt->Ashift;
  channel <<= fmt->Aloss;
  *a = static_cast<Uint8>(channel);
}

它可能会更快。我没有做过任何计时测试,但这或许值得一试,因为你是直接操作像素内存中的颜色,然后每帧只需简单地进行一次 blit,不需要进行任何 malloc/free。


网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接