在这个答案中,我提出了三个优化:
- 使用
boost::container::flat_set
替换对象std::set
,以改善局部性(并且很可能减少重新分配成本,因为大多数对象集都是<4个元素)。
- 使用
string_atom
替换对象idstd::string
(在技术上是char const*
但在逻辑上是唯一的)。这种技术类似于各种VM实现(如Java/.NET)中的内部字符串。
- 使用
equal_range
调用替换映射交集,通过避免复制(大量的)数据来节省大量时间。
更新: 在下面的最终版本中,将boost::container::flat_map
简单地替换回std::set
会降低性能,仅find_range
的因子约为2倍,而find_range_ex
则约为4倍,在我的测试系统上。
注意: make_atom
的当前实现过于简单,并且永远不会释放原子。您可能希望将字符串备份到deque中,使用Boost Flyweights、池分配器或其中某些组合使其更智能化。但是,是否需要这样做取决于您的用例。
更新: 当仅应用此优化时,速度提升已经达到了26~30倍。请注意,内存使用量大约是另外两个优化的两倍,为20MiB左右。
容量和数据效率
在开始之前,我想了解数据的样式。因此,编写一些代码来解析 bmap.txt
输入,我们可以得到:
在 Coliru 上查看源代码
Parsed ok
Histogram of 66425 input lines
d: 3367
f: 20613
p: 21222
v: 21223
ranges size: 6442450944
ranges iterative size: 21223
Min object set: 1.000000
Max object set: 234.000000
Average object set: 3.129859
Min interval width: 1024.000000
Max interval width: 2526265344.000000
Average interval width: 296.445177k
First: [0,1048576)
Last: [3916185600,6442450944)
String atoms: 23904 unique in 66425 total
Atom efficiency: 35.986451%
正如您所看到的,这些集合通常包含约3个项目,并且有许多重复项。
使用make_atom
对象命名方法和boost::flat_set
可以将内存分配从~15GiB降至~10Gib。
此优化还将字符串比较减少为指针比较,用于集合插入和interval_map
的组合策略,因此对于更大的数据集,这具有潜在的加速作用。
查询效率
部分复制输入确实严重影响了查询性能。
不要复制,而是通过替换来查看重叠范围:
const ranges r = *map & window;
ranges::const_iterator iter = r.begin ();
while (iter != r.end ()) {
与
auto r = map->equal_range(window)
ranges::const_iterator iter = r.first
while (iter != r.second) {
在我的系统上,使用两个版本运行10000个相同的随机查询,结果表明速度提升了32倍。
10000 'random' OLD lookups resulted in 156729884 callbacks in 29148ms
10000 'random' NEW lookups resulted in 156729884 callbacks in 897ms
real 0m31.715s
user 0m31.664s
sys 0m0.012s
现在运行时主要由解析/统计控制。完整的基准测试代码在这里:On Coliru
#define BOOST_RESULT_OF_USE_DECTYPE
#define BOOST_SPIRIT_USE_PHOENIX_V3
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <inttypes.h>
#include <assert.h>
#include <boost/icl/interval.hpp>
#include <boost/icl/interval_set.hpp>
#include <boost/icl/interval_map.hpp>
#include <boost/container/flat_set.hpp>
using namespace std;
static size_t atoms_requested = 0;
static size_t atoms_unique_created = 0;
using string_atom = const char*;
string_atom make_atom(std::string&& s)
{
static std::set<std::string> s_atoms;
atoms_requested += 1;
auto it = s_atoms.find(s);
if (it != s_atoms.end())
return it->c_str();
atoms_unique_created += 1;
return s_atoms.insert(std::move(s)).first->c_str();
}
typedef boost::container::flat_set<string_atom> objects;
typedef boost::icl::interval_map<uint64_t, objects> ranges;
ranges*
new_ranges (void)
{
return new ranges ();
}
void
free_ranges (ranges *mapv)
{
ranges *map = (ranges *) mapv;
delete map;
}
extern "C" void
insert_range (void *mapv, uint64_t start, uint64_t end, const char *object)
{
ranges *map = (ranges *) mapv;
objects obj_set;
obj_set.insert (obj_set.end(), object);
map->add (std::make_pair (boost::icl::interval<uint64_t>::right_open (start, end),
obj_set));
}
extern "C" void
iter_range (void *mapv, void (*f) (uint64_t start, uint64_t end, const char *object, void *opaque), void *opaque)
{
ranges *map = (ranges *) mapv;
ranges::iterator iter = map->begin ();
while (iter != map->end ()) {
boost::icl::interval<uint64_t>::type range = iter->first;
uint64_t start = range.lower ();
uint64_t end = range.upper ();
objects obj_set = iter->second;
objects::iterator iter2 = obj_set.begin ();
while (iter2 != obj_set.end ()) {
f (start, end, *iter2, opaque);
iter2++;
}
iter++;
}
}
extern "C" void
find_range (void const *mapv, uint64_t start, uint64_t end, void (*f) (uint64_t start, uint64_t end, const char *object, void *opaque), void *opaque)
{
const ranges *map = (const ranges *) mapv;
boost::icl::interval<uint64_t>::type window;
window = boost::icl::interval<uint64_t>::right_open (start, end);
const ranges r = *map & window;
ranges::const_iterator iter = r.begin ();
while (iter != r.end ()) {
boost::icl::interval<uint64_t>::type range = iter->first;
uint64_t start = range.lower ();
uint64_t end = range.upper ();
objects obj_set = iter->second;
objects::iterator iter2 = obj_set.begin ();
while (iter2 != obj_set.end ()) {
f (start, end, *iter2, opaque);
iter2++;
}
iter++;
}
}
extern "C" void
find_range_ex (void const *mapv, uint64_t start, uint64_t end, void (*f) (uint64_t start, uint64_t end, const char *object, void *opaque), void *opaque)
{
const ranges *map = (const ranges *) mapv;
boost::icl::interval<uint64_t>::type window;
window = boost::icl::interval<uint64_t>::right_open (start, end);
#if 0
const ranges r = *map & window;
ranges::const_iterator iter = r.begin ();
while (iter != r.end ()) {
#else
auto r = map->equal_range(window);
ranges::const_iterator iter = r.first;
while (iter != r.second) {
#endif
boost::icl::interval<uint64_t>::type range = iter->first;
uint64_t start = range.lower ();
uint64_t end = range.upper ();
objects obj_set = iter->second;
objects::iterator iter2 = obj_set.begin ();
while (iter2 != obj_set.end ()) {
f (start, end, *iter2, opaque);
iter2++;
}
iter++;
}
}
#include <memory>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <boost/accumulators/accumulators.hpp>
#include <boost/accumulators/statistics.hpp>
#include <fstream>
#include <chrono>
std::map<char, size_t> histo;
bool insert_line_of_input(ranges& bmap_data, uint64_t b, uint64_t e, char type, std::string& object) {
if (!object.empty())
histo[type]++;
#if 0
object.insert(object.begin(), ':');
object.insert(object.begin(), type);
#endif
insert_range(&bmap_data, b, e, make_atom(std::move(object)));
return true;
}
std::vector<std::pair<uint64_t, uint64_t> > generate_test_queries(ranges const& bmap_data, size_t n) {
std::vector<std::pair<uint64_t, uint64_t> > queries;
queries.reserve(n);
for (size_t i = 0; i < n; ++i)
{
auto start = (static_cast<uint64_t>(rand()) * rand()) % bmap_data.size();
auto end = start + rand();
queries.emplace_back(start,end);
}
return queries;
}
ranges read_mapfile(const char* fname) {
std::ifstream ifs(fname);
boost::spirit::istream_iterator f(ifs >> std::noskipws), l;
ranges bmap_data;
namespace phx = boost::phoenix;
using namespace boost::spirit::qi;
uint_parser<uint64_t, 16> offset;
if (!phrase_parse(f,l,
("1 " >> offset >> offset >> char_("pvdf") >> as_string[lexeme[+graph] >> attr('/') >> lexeme[*~char_("\r\n")]])
[ _pass = phx::bind(insert_line_of_input, phx::ref(bmap_data), _1, _2, _3, _4) ] % eol >> *eol,
blank))
{
exit(255);
} else
{
std::cout << "Parsed ok\n";
}
if (f!=l)
std::cout << "Warning: remaining input '" << std::string(f,l) << "\n";
return bmap_data;
}
void report_statistics(ranges const& bmap_data) {
size_t total = 0;
for (auto e : histo) total += e.second;
std::cout << "Histogram of " << total << " input lines\n";
for (auto e : histo)
std::cout << e.first << ": " << e.second << "\n";
namespace ba = boost::accumulators;
ba::accumulator_set<double, ba::stats<ba::tag::mean, ba::tag::max, ba::tag::min> >
object_sets, interval_widths;
for (auto const& r : bmap_data)
{
auto width = r.first.upper() - r.first.lower();
assert(width % 1024 == 0);
interval_widths(width);
object_sets(r.second.size());
}
std::cout << std::fixed;
std::cout << "ranges size: " << bmap_data.size() << "\n";
std::cout << "ranges iterative size: " << bmap_data.iterative_size() << "\n";
std::cout << "Min object set: " << ba::min(object_sets) << "\n" ;
std::cout << "Max object set: " << ba::max(object_sets) << "\n" ;
std::cout << "Average object set: " << ba::mean(object_sets) << "\n" ;
std::cout << "Min interval width: " << ba::min(interval_widths) << "\n" ;
std::cout << "Max interval width: " << ba::max(interval_widths) << "\n" ;
std::cout << "Average interval width: " << ba::mean(interval_widths)/1024.0 << "k\n" ;
std::cout << "First: " << bmap_data.begin()->first << "\n" ;
std::cout << "Last: " << bmap_data.rbegin()->first << "\n" ;
std::cout << "String atoms: " << atoms_unique_created << " unique in " << atoms_requested << " total\n";
std::cout << "Atom efficiency: " << (atoms_unique_created*100.0/atoms_requested) << "%\n";
}
void perform_comparative_benchmarks(ranges const& bmap_data, size_t number_of_queries) {
srand(42);
auto const queries = generate_test_queries(bmap_data, number_of_queries);
using hrc = std::chrono::high_resolution_clock;
{
auto start = hrc::now();
size_t callbacks = 0;
for (auto const& q: queries)
{
find_range(&bmap_data, q.first, q.second,
[](uint64_t start, uint64_t end, const char *object, void *opaque) {
++(*static_cast<size_t*>(opaque));
}, &callbacks);
}
std::cout << number_of_queries << " 'random' OLD lookups resulted in " << callbacks
<< " callbacks in " << std::chrono::duration_cast<std::chrono::milliseconds>((hrc::now()-start)).count() << "ms\n";
}
{
auto start = hrc::now();
size_t callbacks = 0;
for (auto const& q: queries)
{
find_range_ex(&bmap_data, q.first, q.second,
[](uint64_t start, uint64_t end, const char *object, void *opaque) {
++(*static_cast<size_t*>(opaque));
}, &callbacks);
}
std::cout << number_of_queries << " 'random' NEW lookups resulted in " << callbacks
<< " callbacks in " << std::chrono::duration_cast<std::chrono::milliseconds>((hrc::now()-start)).count() << "ms\n";
}
}
int main() {
auto bmap = read_mapfile("bmap.txt");
report_statistics(bmap);
perform_comparative_benchmarks(bmap, 1000);
#if 0
for (auto const& r : bmap)
{
std::cout << r.first << "\t" << r.second.size() << "\t";
std::copy(r.second.begin(), r.second.end(), std::ostream_iterator<std::string>(std::cout, "\t"));
std::cout << "\n";
}
#endif
}
boost::icl::interval<uint64_t>::type range = iter->first;
将一个集合变成范围? - Surtmap & window
)创建了另一个区间映射(r
)。迭代遍历这个映射。iter->first
返回迭代器的第一部分(==键==uint64_t
区间)。 - Rich