create table tweets ( created_at timestamp, id string primary key, retweeted boolean, source string INDEX using fulltext, text string INDEX using fulltext, user_id string );
$ head data/books/books 0|6-20386-216-4|STUDY-AIDS|1998-05-31|Gakken|166.99 1|0-60558-466-8|JUVENILE-NONFICTION|1975-02-12|Holtzbrinck|128.99 2|3-16551-636-9|POETRY|1988-01-24|Oxford University Press|155.99 3|4-75505-741-2|COMICS-GRAPHIC-NOVELS|1992-02-24|Saraiva|101.99 4|3-32982-589-8|PERFORMING-ARTS|2011-03-09|Cambridge University Press|183.99
基础命令:
1 2 3 4
./crash --help ./crash --hosts node1 --sysinfo
./crash --hosts node1 -c "show tables"
创建表结构,导入数据。
1 2 3 4 5 6 7 8 9 10
CREATE TABLE books ( id integer, isbn string, category string, publish_date string, publisher string, price float );
COPY books FROM '/disk01/data/books/books.json';
通过如上命令,可以生成不同级别大小的测试数据,根据参数可以生成不同大小的表。
测试场景1
表级别:千万级
效率:15m/s (node)
JSON大小:6.6 g
入库CrateDB大小:5.2 g
数据量:47582950
分片数量:6
副本数:2
memory:16g
vcpu:24
storage:1.4T (4*280g)
network:千兆
主要针对单表的查询测试。
1 2 3 4 5 6 7 8 9 10 11
select category,count(*) from books group by category limit 100; -- 3.137 s
select category,count(*) as num from books group by category order by num limit 100; --2.929 sec
select category,count(*) as num from books where category='SCIENCE' group by category order by num limit 100; --0.143 sec
select count(*) from books where category='SCIENCE' limit 100; -- 0.022 sec
select count(distinct category) from books limit 100; -- 2.990 sec
select distinct category from books limit 100; -- 3.032 sec
修改 number_of_shards 看是否提升性能
1 2 3 4 5
ALTER TABLE books SET (number_of_shards = 48)
OPTIMIZE table books; -- 这个参数比较有用,可以提升性能
SELECT count(*) as num_shards, sum(num_docs) as num_docs FROM sys.shards WHERE schema_name = 'doc' AND table_name = 'books';
测试场景2
表级别:亿级表
效率:17285.888 sec
JSON大小:33g
入库CrateDB大小:27g
数据量:235265838
node_num: 3
分片数量:1024
副本数:2
memory:100g
vcpu:24
storage:1.4T (4*280g)
每入库秒钟:13610 条/s
network:千兆
创建表,导入数据。
1 2 3 4 5 6 7 8 9 10
CREATE TABLE books_t1 ( id integer, isbn string, category string INDEX using fulltext, publish_date string, publisher string INDEX using fulltext, price float ) CLUSTERED BY (category) INTO 1024 SHARDS with (number_of_replicas = 2, refresh_interval=10000);
select category,count(*) from books group by category limit 100; -- 37.878 sec
select category,count(*) as num from books group by category order by num limit 100; -- 46.603 sec
select category,count(*) as num from books where category='SCIENCE' group by category order by num limit 100; -- 11.808 sec
select count(*) from books where category='SCIENCE' limit 100; -- 0.002 sec
select count(distinct category) from books limit 100; -- 44.924 sec
select distinct category from books limit 100; -- 44.335 sec
select id,price,publisher from books where publish_date='1999-02-02' and category='SCIENCE' limit 100; -- 0.347 sec
select price,count(publisher) from books where publish_date='1999-02-02' and category='SCIENCE' group by price order by price desc limit 100; -- 0.981 sec
select price,category from books where publisher='Kyowon' group by price,category order by price limit 100; -- 3.602 sec
select price,category,count(*) from books where publisher='Kyowon' group by price,category order by price limit 100; -- 1.406 sec