In this blog post I will examine how the number of rows inserted in a single multi-row INSERT statement impacts timing.
PostgreSQL's "INSERT multiple rows" approach means that a single INSERT statement inserts multiple rows into a table.
The approach has the following syntax:
INSERT INTO table_name (column_list)
VALUES
(value_list_1),
(value_list_2),
...
(value_list_n);
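For example, a single statement like the one below (using a hypothetical products table, just for illustration) inserts three rows at once:
INSERT INTO products (name, price, stock)
VALUES
('pencil', 0.50, 100),
('notebook', 2.75, 40),
('eraser', 0.90, 60);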
Assume I have a table:
create table my_table(a varchar(100), b timestamp, c bigint);
postgres=# create table my_table(a varchar(100), b timestamp, c bigint);
CREATE TABLE
postgres=# \d my_table
Table "public.my_table"
Column | Type | Collation | Nullable | Default
--------+-----------------------------+-----------+----------+---------
a | character varying(100) | | |
b | timestamp without time zone | | |
c | bigint | | |
postgres=#
To populate it using the INSERT multiple rows approach, I will use a PL/pgSQL function.
It takes two input parameters:
- n_of_recs - the number of records to insert into the table
- chunk_size - the number of rows inserted at once using the INSERT multiple rows approach
create or replace function populate_table(n_of_recs bigint, chunk_size smallint)
returns bigint
language plpgsql
as
$$
declare
time_ms bigint := 0;
random_varchar_length smallint;
random_varchar varchar(100);
random_timestamp timestamp;
random_int bigint;
query text;
start_ts timestamp;
end_ts timestamp;
begin
if chunk_size <= 0 then
raise exception 'The chunk_size should be positive.';
end if;
select clock_timestamp() into start_ts;
query := 'insert into my_table values ';
for idx in 1..n_of_recs loop
-- some random varchar length between 1 and 100
random_varchar_length := floor(random()*(100-1+1))+1;
-- some random varchar
random_varchar := array_to_string(array(select chr((ascii('a') + round(random() * 25)) :: integer) from generate_series(1,random_varchar_length)), '');
-- some random int between 55 and 777777777
random_int := floor(random()*(777777777-55+1))+55;
-- some random timestamp between '1799-01-01 00:00:00' and '2080-01-01 00:00:00'
random_timestamp := timestamp '1799-01-01 00:00:00' + random() * (timestamp '2080-01-01 00:00:00' - timestamp '1799-01-01 00:00:00');
if (idx = n_of_recs) or (idx % chunk_size = 0) then
query := query || format('(''%s'', ''%s'', %s);', random_varchar, random_timestamp, random_int);
-- raise notice 'Flush. Populated total: % recs', idx;
execute query;
query := 'insert into my_table values ';
else
query := query || format('(''%s'', ''%s'', %s), ', random_varchar, random_timestamp, random_int);
end if;
end loop;
select clock_timestamp() into end_ts;
select
round ((
(extract(epoch from end_ts) -
extract(epoch from start_ts))) * 1000)
into time_ms;
return time_ms;
end;
$$;
CREATE FUNCTION
postgres=# \df populate_table
List of functions
Schema | Name | Result data type | Argument data types | Type
--------+----------------+------------------+---------------------------------------+------
public | populate_table | bigint | n_of_recs bigint, chunk_size smallint | func
(1 row)
postgres=#
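Internally, the function accumulates value lists in the query variable and executes the accumulated statement every chunk_size rows, flushing any remainder on the last iteration. The statement it builds and runs looks roughly like this (the values below are made up for illustration):
insert into my_table values ('kqzr', '1843-07-19 02:11:45', 398214577), ('bmwvao', '2012-10-02 18:54:03', 71123), ..., ('p', '1901-01-28 09:33:20', 640000001);
Since random_varchar contains only lowercase letters, wrapping the values in '' by hand is safe here; in the general case, format's %L placeholder would be the safer choice, e.g.:
query := query || format('(%L, %L, %s), ', random_varchar, random_timestamp, random_int);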
Now let's examine how the timing of inserting data into a table using the INSERT multiple rows approach depends on the number of rows inserted at once.
postgres=# create table check_the_timing(n_of_recs bigint, chunk_size smallint, timing_ms bigint);
CREATE TABLE
postgres=# \d check_the_timing
Table "public.check_the_timing"
Column | Type | Collation | Nullable | Default
------------+----------+-----------+----------+---------
n_of_recs | bigint | | |
chunk_size | smallint | | |
timing_ms | bigint | | |
Let's check how long it takes to insert 100000 rows into my_table using the INSERT multiple rows approach, with each INSERT flushing 100 rows at once.
postgres=# select populate_table(100000::bigint, 100::smallint)::bigint timing_ms;
timing_ms
-----------
5734
(1 row)
postgres=#
According to the output it takes 5734 milliseconds.
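As a quick sanity check (a sketch, output not shown here), the number of rows in my_table should match the requested number of records:
select count(*) from my_table;  -- expected: 100000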
Let's examine how long it takes to insert 200000 rows into my_table using the INSERT multiple rows approach.
Each INSERT flushes chunk_size rows at once, where chunk_size iterates from 5000 to 20000 in steps of 2500.
To keep the experiment clean, I will truncate my_table before every probe.
do
$do$
declare
n_chunk_size smallint := 20000;
begin
for n_rows_in_a_single_insert in 5000..n_chunk_size by 2500 loop
truncate table my_table;
with n_of_recs as (select 200000::bigint val),
chunk_size as (select n_rows_in_a_single_insert val),
timing_ms as (select populate_table(n_of_recs.val::bigint, chunk_size.val::smallint)::bigint val from n_of_recs, chunk_size)
insert into check_the_timing select n_of_recs.val, chunk_size.val, timing_ms.val
from n_of_recs, chunk_size, timing_ms;
end loop;
end;
$do$;
Let's analyze the obtained results:
select n_of_recs, chunk_size, timing_ms
from check_the_timing
order by 1, 2;
postgres=# select n_of_recs, chunk_size, timing_ms
from check_the_timing
order by 1, 2;
n_of_recs | chunk_size | timing_ms
-----------+------------+-----------
200000 | 5000 | 17575
200000 | 7500 | 20748
200000 | 10000 | 26458
200000 | 12500 | 29194
200000 | 15000 | 31886
200000 | 17500 | 35876
200000 | 20000 | 44726
(7 rows)
Let's present the data as a chart:
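A rough ASCII version of such a chart can also be produced directly in psql; the sketch below prints one '#' per second of timing_ms, using the check_the_timing table populated above:
select chunk_size,
       timing_ms,
       repeat('#', (timing_ms / 1000)::int) as bar
from check_the_timing
where n_of_recs = 200000
order by chunk_size;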
Another example: how long does it take to insert 1000000 rows into my_table using the INSERT multiple rows approach?
This time the chunk size will go up to nearly 40000 rows per INSERT, which exceeds the smallint range (maximum 32767), so I recreate both the check_the_timing table and the populate_table function with chunk_size declared as bigint, dropping the previous versions first.
postgres=# create table check_the_timing(n_of_recs bigint, chunk_size bigint, timing_ms bigint);
CREATE TABLE
postgres=# \d check_the_timing
Table "public.check_the_timing"
Column | Type | Collation | Nullable | Default
------------+--------+-----------+----------+---------
n_of_recs | bigint | | |
chunk_size | bigint | | |
timing_ms | bigint | | |
postgres=#
create or replace function populate_table(n_of_recs bigint, chunk_size bigint)
returns bigint
language plpgsql
as
$$
declare
time_ms bigint := 0;
random_varchar_length bigint;
random_varchar varchar(100);
random_timestamp timestamp;
random_int bigint;
query text;
start_ts timestamp;
end_ts timestamp;
begin
if chunk_size <= 0 then
raise exception 'The chunk_size should be positive.';
end if;
select clock_timestamp() into start_ts;
query := 'insert into my_table values ';
for idx in 1..n_of_recs loop
-- some random varchar length between 1 and 100
random_varchar_length := floor(random()*(100-1+1))+1;
-- some random varchar
random_varchar := array_to_string(array(select chr((ascii('a') + round(random() * 25)) :: integer) from generate_series(1,random_varchar_length)), '');
-- some random int between 55 and 777777777
random_int := floor(random()*(777777777-55+1))+55;
-- some random timestamp between '1799-01-01 00:00:00' and '2080-01-01 00:00:00'
random_timestamp := timestamp '1799-01-01 00:00:00' + random() * (timestamp '2080-01-01 00:00:00' - timestamp '1799-01-01 00:00:00');
if (idx = n_of_recs) or (idx % chunk_size = 0) then
query := query || format('(''%s'', ''%s'', %s);', random_varchar, random_timestamp, random_int);
raise notice 'Flush. Populated total: % recs', idx;
execute query;
query := 'insert into my_table values ';
else
query := query || format('(''%s'', ''%s'', %s), ', random_varchar, random_timestamp, random_int);
end if;
end loop;
select clock_timestamp() into end_ts;
select
round ((
(extract(epoch from end_ts) -
extract(epoch from start_ts))) * 1000)
into time_ms;
return time_ms;
end;
$$;
postgres=# \df populate_table
List of functions
Schema | Name | Result data type | Argument data types | Type
--------+----------------+------------------+-------------------------------------+------
public | populate_table | bigint | n_of_recs bigint, chunk_size bigint | func
(1 row)
postgres=#
Each INSERT flushes chunk_size rows at once, where chunk_size iterates from 2500 to 40000 in steps of 5000.
To keep the experiment clean, I will truncate my_table before every probe.
do
$do$
begin
for n_rows_in_a_single_insert in 2500..40000 by 5000 loop
raise notice 'n_rows_in_a_single_insert: %', n_rows_in_a_single_insert;
truncate table my_table;
with n_of_recs as (select 1000000::bigint val),
chunk_size as (select n_rows_in_a_single_insert val),
timing_ms as (select populate_table(n_of_recs.val::bigint, chunk_size.val::bigint)::bigint val from n_of_recs, chunk_size)
insert into check_the_timing select n_of_recs.val, chunk_size.val, timing_ms.val
from n_of_recs, chunk_size, timing_ms;
end loop;
end;
$do$;
Let's analyze the obtained results:
select n_of_recs, chunk_size, timing_ms
from check_the_timing
order by 1, 2;
postgres=# select n_of_recs, chunk_size, timing_ms
from check_the_timing
order by 1, 2;
n_of_recs | chunk_size | timing_ms
-----------+------------+-----------
1000000 | 2500 | 75140
1000000 | 7500 | 105269
1000000 | 12500 | 143569
1000000 | 17500 | 189686
1000000 | 22500 | 235968
1000000 | 27500 | 307523
1000000 | 32500 | 399844
1000000 | 37500 | 503795
(8 rows)
postgres=#
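To make the trend easier to read, a rough per-row cost can be derived from the collected timings (a sketch; it assumes check_the_timing as populated above):
select chunk_size,
       timing_ms,
       round(timing_ms::numeric * 1000 / n_of_recs, 2) as ms_per_1000_rows
from check_the_timing
order by chunk_size;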
Conclusion:
In this blog post I demonstrated an approach for examining how the number of rows flushed in a single multi-row INSERT impacts timing. In both experiments the total time to insert a fixed number of rows grew as the chunk size grew, so within the tested ranges larger chunks did not translate into faster loads.