在另一个答案中,我向您指出了
boost::flyweight
,现在我想更详细地研究字符串容器、Flyweight以及将四个字符与指针合并(代码中的“sillystring”类)的相对效率。
代码注释如下:
- 适用于32位Debian / squeeze,使用g ++ 4.3.3和boost 1.38.0
- 使用
std::deque
而不是std::vector
,因为向量的大小加倍行为(参见deques' chunks)会给人们留下(可以争辩的)低效印象,如果测试用例刚刚翻倍。
- “愚蠢的字符串”使用指针的LSB来区分指针用例和本地字符用例。除非您的malloc在奇数字节边界上分配(在这种情况下,代码将抛出异常)(在我的系统上肯定没有看到过这种情况; YMMV),否则它应该有效。在任何人感觉有必要指出之前,是的,我认为这是可怕的危险的hacky代码,不是轻易选择的选项。
结果因单词长度分布而异,但对于分布为“(2D6+1)/2”(峰值在4处,但长度从1到6不等)的情况,效率(定义为实际内存消耗和需要存储的实际字符数之间的比率)如下:
- 12.4%
deque<string>
- 20.9%
deque<flyweight<string> >
- 43.7%
deque<sillystring>
如果所有单词都是4个字符(在单词生成器中更改为const int length=4;
),这是sillystring的理想情况,则您将获得:
- 14.2%
deque<string>
- 77.8%
deque<flyweight<string> >
- 97.0%
deque<sillystring>
因此,轻量级模式确实是一种快速改进,但您可以通过利用单词适合指针大小的能力并避免额外的堆开销来做得更好。
以下是代码:
#include <boost/flyweight.hpp>
#include <boost/format.hpp>
#include <boost/random.hpp>
#include <cstring>
#include <deque>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
#include <sys/types.h>
#include <unistd.h>
#define THROW(X,MSG) throw X(boost::str(boost::format("%1%: %2%") % __PRETTY_FUNCTION__ % MSG))
struct random_word_generator
{
random_word_generator(uint seed)
:_rng(seed),
_length_dist(1,6),
_letter_dist('a','z'),
_random_length(_rng,_length_dist),
_random_letter(_rng,_letter_dist)
{}
std::string operator()()
{
std::string r;
const int length=(_random_length()+_random_length()+1)/2;
for (int i=0;i<length;i++) r+=static_cast<char>(_random_letter());
return r;
}
private:
boost::mt19937 _rng;
boost::uniform_int<> _length_dist;
boost::uniform_int<> _letter_dist;
boost::variate_generator<boost::mt19937&,boost::uniform_int<> >
_random_length;
boost::variate_generator<boost::mt19937&,boost::uniform_int<> >
_random_letter;
};
struct collector
{
collector(){}
virtual ~collector(){}
virtual void insert(const std::string&)
=0;
virtual void dump(const std::string&) const
=0;
};
struct string_collector
: public std::deque<std::string>,
public collector
{
void insert(const std::string& s)
{
push_back(s);
}
void dump(const std::string& f) const
{
std::ofstream out(f.c_str(),std::ios::out);
for (std::deque<std::string>::const_iterator it=begin();it!=end();it++)
out << *it << std::endl;
}
};
struct flyweight_collector
: public std::deque<boost::flyweight<std::string> >,
public collector
{
void insert(const std::string& s)
{
push_back(boost::flyweight<std::string>(s));
}
void dump(const std::string& f) const
{
std::ofstream out(f.c_str(),std::ios::out);
for (std::deque<boost::flyweight<std::string> >::const_iterator it=begin();
it!=end();
it++
)
out << *it << std::endl;
}
};
struct sillystring
{
sillystring()
{
_rep.bits=0;
}
sillystring(const std::string& s)
{
_rep.bits=0;
assign(s);
}
sillystring(const sillystring& s)
{
_rep.bits=0;
assign(s.str());
}
~sillystring()
{
if (is_ptr()) delete [] ptr();
}
sillystring& operator=(const sillystring& s)
{
assign(s.str());
}
void assign(const std::string& s)
{
if (is_ptr()) delete [] ptr();
if (s.size()>4)
{
char*const p=new char[s.size()+1];
if (reinterpret_cast<unsigned int>(p)&0x00000001)
THROW(std::logic_error,"unexpected odd-byte address returned from new");
_rep.ptr.value=(reinterpret_cast<unsigned int>(p)>>1);
_rep.ptr.is_ptr=1;
strcpy(ptr(),s.c_str());
}
else
{
_rep.txt.is_ptr=0;
_rep.txt.c0=(s.size()>0 ? validate(s[0]) : 0);
_rep.txt.c1=(s.size()>1 ? validate(s[1]) : 0);
_rep.txt.c2=(s.size()>2 ? validate(s[2]) : 0);
_rep.txt.c3=(s.size()>3 ? validate(s[3]) : 0);
}
}
std::string str() const
{
if (is_ptr())
{
return std::string(ptr());
}
else
{
std::string r;
if (_rep.txt.c0) r+=_rep.txt.c0;
if (_rep.txt.c1) r+=_rep.txt.c1;
if (_rep.txt.c2) r+=_rep.txt.c2;
if (_rep.txt.c3) r+=_rep.txt.c3;
return r;
}
}
private:
bool is_ptr() const
{
return _rep.ptr.is_ptr;
}
char* ptr()
{
if (!is_ptr())
THROW(std::logic_error,"unexpected attempt to use pointer");
return reinterpret_cast<char*>(_rep.ptr.value<<1);
}
const char* ptr() const
{
if (!is_ptr())
THROW(std::logic_error,"unexpected attempt to use pointer");
return reinterpret_cast<const char*>(_rep.ptr.value<<1);
}
static char validate(char c)
{
if (c&0x80)
THROW(std::range_error,"can only deal with 7-bit characters");
return c;
}
union
{
struct
{
unsigned int is_ptr:1;
unsigned int value:31;
} ptr;
struct
{
unsigned int is_ptr:1;
unsigned int c0:7;
unsigned int :1;
unsigned int c1:7;
unsigned int :1;
unsigned int c2:7;
unsigned int :1;
unsigned int c3:7;
} txt;
unsigned int bits;
} _rep;
};
struct sillystring_collector
: public std::deque<sillystring>,
public collector
{
void insert(const std::string& s)
{
push_back(sillystring(s));
}
void dump(const std::string& f) const
{
std::ofstream out(f.c_str(),std::ios::out);
for (std::deque<sillystring>::const_iterator it=begin();
it!=end();
it++
)
out << it->str() << std::endl;
}
};
size_t memsize()
{
const pid_t pid=getpid();
std::ostringstream cmd;
cmd << "awk '($1==\"VmData:\"){print $2,$3;}' /proc/" << pid << "/status";
FILE*const f=popen(cmd.str().c_str(),"r");
if (!f)
THROW(std::runtime_error,"popen failed");
int amount;
char units[4];
if (fscanf(f,"%d %3s",&amount,&units[0])!=2)
THROW(std::runtime_error,"fscanf failed");
if (pclose(f)!=0)
THROW(std::runtime_error,"pclose failed");
if (units[0]!='k' || units[1]!='B')
THROW(std::runtime_error,"unexpected input");
return static_cast<size_t>(amount)*static_cast<size_t>(1<<10);
}
int main(int argc,char** argv)
{
if (sizeof(void*)!=4)
THROW(std::logic_error,"64-bit not supported");
if (sizeof(sillystring)!=4)
THROW(std::logic_error,"Compiler didn't produce expected result");
if (argc!=2)
THROW(std::runtime_error,"Expected single command-line argument");
random_word_generator w(23);
std::auto_ptr<collector> c;
switch (argv[1][0])
{
case '0':
std::cout << "Testing container of strings\n";
c=std::auto_ptr<collector>(new string_collector);
break;
case '1':
std::cout << "Testing container of flyweights\n";
c=std::auto_ptr<collector>(new flyweight_collector);
break;
case '2':
std::cout << "Testing container of sillystrings\n";
c=std::auto_ptr<collector>(new sillystring_collector);
break;
default:
THROW(std::runtime_error,"Unexpected command-line argument");
}
const size_t mem0=memsize();
size_t textsize=0;
size_t filelength=0;
while (filelength<(100<<20))
{
const std::string s=w();
textsize+=s.size();
filelength+=(s.size()+1);
c->insert(s);
}
const size_t mem1=memsize();
const ptrdiff_t memused=mem1-mem0;
std::cout
<< "Memory increased by " << memused/static_cast<float>(1<<20)
<< " megabytes for " << textsize/static_cast<float>(1<<20)
<< " megabytes text; efficiency " << (100.0*textsize)/memused << "%"
<< std::endl;
return 0;
}