Start Postgres DB with docker
$ docker run -d -p 5432:5432 -e POSTGRES_PASSWORD=pass \
-v pgdata:/data/postgres -v user_data:/user_data docker.io/postgres
-- To Connect
$ psql postgres://postgres:pass@localhost:5432
drop domain if exists address_num_type cascade;
create domain address_num_type AS integer
check (VALUE > 0 and VALUE < 10000);
drop domain if exists cep_type cascade;
create domain cep_type AS TEXT
check(
VALUE ~ '^\d{7,8}$'
OR VALUE ~ '^\d{5}-\d{3}$'
);
drop table ADDRESS;
create table ADDRESS(
id serial primary key,
num address_num_type not null,
cep cep_type not null
);
insert into ADDRESS values (default, 20, '50000000');
-- Done
insert into ADDRESS values (default, -1, '50000000');
-- SQL Error [23514]: ERROR: value for domain address_num_type violates check
-- constraint "address_num_type_check"
insert into ADDRESS values (default, 20, '500');
-- SQL Error [23514]: ERROR: value for domain cep_type violates check
-- constraint "cep_type_check"
Equivalent to MERGE.
ON CONFLICT (name): Specifies the unique constraint or index to check for conflicts.
DO:
EXCLUDED
table;
EXCLUDED: A special table reference representing the values that were attempted to be inserted.
SQL Error [23505]: ERROR: duplicate key value violates unique constraint “foo” Detail: Key (bar)=(val) already exists.
CREATE TEMPORARY TABLE chcp_users (
id SERIAL PRIMARY KEY,
cpf TEXT UNIQUE,
name text,
age numeric
);
INSERT INTO chcp_users (cpf, name, age)
VALUES ('11122233345', 'Carlos Pena', 27)
ON CONFLICT (cpf)
DO UPDATE SET
name = EXCLUDED.name,
age = EXCLUDED.age;
select * from chcp_users
-- 1 11122233345 Carlos Pena 27
INSERT INTO chcp_users (cpf, name, age)
VALUES ('11122233345', 'Carlos H C Pena', 28)
ON CONFLICT (cpf)
DO UPDATE SET
name = EXCLUDED.name,
age = EXCLUDED.age;
select * from chcp_users
-- 1 11122233345 Carlos H C Pena 28
INSERT INTO chcp_users (cpf, name, age)
VALUES ('11122233345', 'AAAAAAAAA', -1)
ON CONFLICT (cpf)
DO nothing
-- Updated Rows: 0 (error has been suppressed)
select * from chcp_users
-- 1 11122233345 Carlos H C Pena 28
COST
is not everythingCREATE TABLE example_table (
start_num NUMERIC,
end_num NUMERIC,
text_column text
);
CREATE OR REPLACE PROCEDURE populate_table(
in_count INTEGER
)
LANGUAGE plpgsql
AS $$
DECLARE
i INTEGER := 1;
random_st INTEGER := 0;
random_range INTEGER := 0;
BEGIN
WHILE i <= in_count loop
random_st := random() * 1000;
random_range := random() * 1000;
INSERT INTO example_table (start_num, end_num, text_column)
VALUES (random_st, random_st+random_range, 'Row ' || i);
i := i + 1;
END LOOP;
END;
$$;
CALL populate_table(2000000);
select * from example_table
where
100 >= start_num and
100 <= end_num;
-- COST 0.00 - 42739.00 -- Seq Scan
create index idx_example_table_start on example_table(start_num);
create index idx_example_table_end on example_table(end_num);
create index idx_example_table_start_end on example_table(start_num, end_num desc);
select * from example_table
where
100 >= start_num and
100 <= end_num;
-- COST 3798.16 - 19575.96 -- Bitmap Heap Scan (idx_example_table_start)
select * from example_table
where
100 between start_num and end_num;
-- COST 3798.16 - 19575.96 -- Bitmap Heap Scan (idx_example_table_start)
create index idx_example_table_range_gist ON example_table using gist (numrange(start_num, end_num, '[]' ));
-- GiST (Generalized Search Tree)
select * from example_table
where
numeric '100' <@ numrange(start_num, end_num, '[]');
-- COST 385.91 - 13086.03 -- Bitmap Heap Scan (idx_example_table_range_gist)
-- Dummy table with some data
create temporary table example_table (
alfa int,
beta int
);
insert into example_table values(1, 2);
insert into example_table values(10, 30);
insert into example_table values(10, 10);
insert into example_table values(10, 0);
select * from example_table;
-- Add third column `gamma` that will hold the value of `alfa` + `beta`
alter table example_table
add column gamma int;
-- Create trigger to calculate `gamma` for new data
create or replace function calculate_gamma()
returns trigger as $$
begin
new.gamma := new.alfa + new.beta;
return new;
end;
$$ language plpgsql;
create trigger trigger_calculate_gamma
before insert on example_table
for each row
execute function calculate_gamma();
insert into example_table values(0, 0);
insert into example_table values(1, 1);
-- will work, but the old values will continue with null values of gamma
update example_table
set gamma = alfa + beta;
-- update all rows
create table if not exists documents (
id SERIAL PRIMARY KEY,
title TEXT,
content TEXT,
tsv_content tsvector null,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
CREATE INDEX idx_tsv_content ON documents USING gin(tsv_content);
-- Create a trigger to calculate tsv
create or replace function calc_tsv()
returns trigger as $$
begin
new.tsv_content := to_tsvector('english', new.title || ' ' || new.content);
return new;
end;
$$ language plpgsql;
create trigger trigger_calc_tsv
before insert or update on documents
for each row
execute function calc_tsv();
-- drop trigger trigger_calc_tsv on documents;
-- Sample from chatgpt :D
INSERT INTO documents (title, content)
VALUES
('PostgreSQL Full-Text Search Guide', 'This guide covers how to perform full-text search in PostgreSQL. It includes creating tsvectors, tsqueries, and indexing.'),
('Introduction to Database Indexing', 'Database indexing improves the speed of data retrieval operations on a table. This tutorial explains how indexes work in databases like PostgreSQL and MySQL.'),
('Advanced Search Techniques', 'Learn how to implement advanced search techniques in PostgreSQL. This includes phrase search, prefix search, and ranking results by relevance.'),
('What is SQL?', 'Structured Query Language, or SQL, is a standard programming language for managing relational databases. It is used to perform queries, update data, and more.'),
('PostgreSQL vs MySQL: A Comparison', 'In this article, we compare PostgreSQL and MySQL in terms of performance, scalability, and support for advanced features like full-text search.'),
('Building a REST API with FastAPI', 'This tutorial shows how to build a REST API using FastAPI, including endpoints for creating, reading, updating, and deleting data.'),
('Data Engineering Best Practices', 'Data engineering involves building systems for collecting, storing, and analyzing data. This article discusses the best practices in data pipeline design, scalability, and optimization.'),
('Python and SQL for Data Analysis', 'Learn how to combine Python with SQL for powerful data analysis. This tutorial covers using libraries like pandas and SQLAlchemy to query databases efficiently.'),
('Understanding GIN Indexes in PostgreSQL', 'GIN indexes are essential for efficient full-text search in PostgreSQL. This guide explains how to create and use GIN indexes to speed up searches.'),
('Optimizing Queries in PostgreSQL', 'Query optimization is key to improving database performance. This guide covers query planning, indexing strategies, and analyzing query performance.')
('Introduction to Machine Learning', 'Machine learning is a field of artificial intelligence that uses statistical techniques to give computer systems the ability to learn from data. This article covers the basic concepts.'),
('Supervised vs Unsupervised Learning', 'This document explains the key differences between supervised and unsupervised learning, with examples of common algorithms used in each approach.'),
('Deep Learning for Image Recognition', 'Deep learning has revolutionized image recognition, enabling machines to achieve superhuman accuracy. This article discusses popular models like Convolutional Neural Networks (CNNs).'),
('Building a Neural Network from Scratch', 'Learn how to build a neural network from scratch using Python. This tutorial walks through each step, including forward propagation, backpropagation, and gradient descent.'),
('Natural Language Processing with Machine Learning', 'This document covers natural language processing (NLP) techniques and how machine learning models, such as recurrent neural networks (RNNs) and transformers, are applied to text data.'),
('Implementing a Recommendation System', 'Recommendation systems use machine learning to provide personalized suggestions. This article explains collaborative filtering and content-based recommendation algorithms.'),
('Transfer Learning in Deep Learning', 'Transfer learning allows models to leverage pre-trained weights from other tasks to solve new problems. This article explains how transfer learning works and its applications.'),
('Reinforcement Learning: An Overview', 'Reinforcement learning is a type of machine learning where agents learn to make decisions by receiving rewards or penalties. This article explores Q-learning, policy gradients, and other RL techniques.'),
('Training a Deep Learning Model with TensorFlow', 'TensorFlow is a popular open-source framework for training deep learning models. This guide covers setting up the environment, defining models, and training them on datasets.'),
('Bias and Variance in Machine Learning Models', 'Understanding the tradeoff between bias and variance is critical for building accurate models. This article explains these concepts and how to address overfitting and underfitting.'),
('Hyperparameter Tuning for Machine Learning Models', 'Hyperparameter tuning can significantly improve model performance. Learn about techniques like grid search, random search, and Bayesian optimization to find the best model parameters.'),
('AI Ethics and Fairness in Machine Learning', 'As AI systems become more integrated into society, ensuring fairness and avoiding bias in machine learning models has become crucial. This article delves into the ethical challenges of AI development.');
-- Search for 'Machine' and 'System'
SELECT id, title, ts_rank(d.tsv_content, to_tsquery('Machine & System')) AS rank
FROM documents d
WHERE d.tsv_content @@ to_tsquery('Machine & System')
/*
Introduction to Machine Learning
Implementing a Recommendation System
AI Ethics and Fairness in Machine Learning
*/
-- Search for 'Machine' and 'Learning' that are not together
SELECT * FROM documents
WHERE tsv_content @@ to_tsquery('english', 'machine & learning')
EXCEPT
SELECT * FROM documents
WHERE tsv_content @@ to_tsquery('english', 'machine <-> learning') -- 'Machine' Near to 'Learning'
/*
1) Deep Learning for Image Recognition; Deep learning has revolutionized image recognition,
enabling machines to achieve superhuman accuracy. This article discusses popular models
like Convolutional Neural Networks (CNNs).
*/
-- Search for 'Deep' and 'learning' and NOT 'machine'
SELECT * FROM documents
WHERE tsv_content @@ to_tsquery('english', '!machine & learning & Deep')
/*
Transfer Learning in Deep Learning
Training a Deep Learning Model with TensorFlow
*/
-- Search for all words starting with "Postg*"
SELECT * FROM documents d
WHERE d.tsv_content @@ to_tsquery('Postg:*')
/*
PostgreSQL Full-Text Search Guide
PostgreSQL vs MySQL: A Comparison
Understanding GIN Indexes in PostgreSQL
Optimizing Queries in PostgreSQL
...
*/
For this example we will create three product tables: prod_price
, prod_stock
, prod_info
and they will be summarized in the prod_sumarização
table.
-- Create Data/Structure
CREATE TABLE prod_price (
product_id INT NOT NULL,
store_id INT NOT NULL,
price NUMERIC(10, 2),
PRIMARY KEY (product_id, store_id)
);
INSERT INTO prod_price (product_id, store_id, price)
VALUES
(1, 101, 19.99),
(1, 102, 10.00),
(1, 103, 20.00),
(1, 104, NULL),
(2, 101, NULL),
(2, 102, 29.99),
(2, 103, 29.99),
(3, 103, 14.49),
(4, 101, 9.99),
(4, 999, 9.99),
(5, 102, 39.99),
(999, 102, 39.99),
(999, 105, NULL);
CREATE TABLE prod_stock (
product_id INT NOT NULL,
store_id INT NOT NULL,
stock INT NOT NULL,
PRIMARY KEY (product_id, store_id)
);
INSERT INTO prod_stock (product_id, store_id, stock)
VALUES
(1, 101, 50),
(1, 102, 0),
(1, 103, 3),
(2, 101, 1),
(2, 102, 30),
(3, 103, 100),
(3, 104, 1),
(3, 105, 5),
(4, 101, 10),
(4, 888, 10),
(5, 102, 0);
(888, 102, 10);
CREATE TABLE if not exists prod_info(
product_id INT PRIMARY KEY,
title TEXT NOT NULL,
tags TEXT[],
description TEXT
);
INSERT INTO prod_info (product_id, title, tags, description)
VALUES
(1, 'Wireless Mouse', ARRAY['electronics', 'peripherals'], 'Compact and ergonomic design.'),
(2, 'Gaming Keyboard', ARRAY['electronics', 'gaming'], 'Mechanical keys with RGB lighting.'),
(3, 'USB-C Cable', ARRAY['electronics', 'accessories'], 'Durable and fast charging.'),
(4, 'Notebook Stand', ARRAY['office', 'ergonomics'], 'Adjustable aluminum stand for laptops.'),
(5, 'Noise Cancelling Headphones', ARRAY['electronics', 'audio'], 'Over-ear headphones with ANC.');
CREATE TABLE prod_summarization (
product_id INT NOT NULL,
store_id INT NOT NULL,
price NUMERIC(10, 2),
stock INT,
title TEXT,
tags TEXT[],
description TEXT,
PRIMARY KEY (product_id, store_id)
);
-- Create a temporary table to help identify removed/added/changed items.
create table temp_prod_summarization as table prod_summarization with no data;
-- This temp table has no index/pk
delete from temp_prod_summarization;
-- Add data into a empty temp_prod_summarization
insert into temp_prod_summarization
(product_id, store_id, price, stock, title, tags, description)
SELECT
p.product_id,
p.store_id,
p.price,
s.stock,
i.title,
i.tags,
i.description
FROM prod_price p
JOIN prod_stock s
ON p.product_id = s.product_id AND p.store_id = s.store_id
JOIN prod_info i
ON p.product_id = i.product_id
where 1=1
and s.stock > 0
and p.price > 0 --and 'ergonomics'=any(i.tags)
-- Add/Update data
INSERT INTO prod_summarization (
product_id, store_id, price, stock, title, tags, description
)
SELECT
product_id, store_id, price, stock, title, tags, description
FROM temp_prod_summarization
ON CONFLICT (product_id, store_id)
DO UPDATE SET
price = EXCLUDED.price,
stock = EXCLUDED.stock,
title = EXCLUDED.title,
tags = EXCLUDED.tags,
description = EXCLUDED.description
WHERE prod_summarization.price <> EXCLUDED.price
OR prod_summarization.stock <> EXCLUDED.stock
OR prod_summarization.title <> EXCLUDED.title
OR prod_summarization.tags <> EXCLUDED.tags
OR prod_summarization.description <> EXCLUDED.description;
-- Delete Data from prod_summarization
DELETE FROM prod_summarization
WHERE (product_id, store_id) NOT IN (
SELECT product_id, store_id FROM temp_prod_summarization
);
-- Remember to keep temp_prod_summarization empty
delete from temp_prod_summarization;
-- TODO: Test this strategy with more data
I recently read the PostgreSQL book/tutorial A Curious Moon. And I decided to do a TL;DR (similar to the previous post about Oracle SQL), although this time focused on the teachings of the book.
The TLDR as well as the code used can be found in the repository: tldr_a_curious_moon.
The idea of the post is to have a streamlined version of the teachings I had per chapter, focused on the Postgres itself. I found it interesting to change the database to avoid a simple copy/paste by myself. I chose a public dataset from the Brazilian government regarding confiscated cigarettes (and destroyed), that I may (or may not) change in the future.
In this section we will use docker-compose to run a local postgres instance, to start run:
docker compose up --remove-orphans [-d]
create
must have a drop * if exists [cascade]
Sargable
(Search ARGument ABLE) if it can take advantage of an index to speed up the execution of the queryLIKE %*
as it is non-sargable
ILIKE
is case-insensitiveto_tsvector(my_text_column)
and search @@ to_tsquery('my_search_query')
Materialized View
is stored in disk (and so, allowed to create an index). Once the original data is updated, it is necessary to update the materialized view: REFRESH MATERIALIZED VIEW my_view_name
ls */*.txt | wc -l
csvsql my_csv_path -i postgresql –tables “import.my_name” –noconstraints –overwrite sed ‘s/VARCHAR/text/g’ > import.sql
cut
command to retrieve the desired columns: cut -d ';' -f 1-2,4-6 user_data/cigarros.csv
with temp_table_name as ( select ...), other_name as (select ... where temp_table_name.x = 2) select * from other_name
with count_events as (select count(*) from events),
count_regions as (select count(*) from regions)
select * from count_events, count_regions;
drop function if exists my_function(arguments_types);
create function my_function (arguments)
returns return_datatype
IMMUTABLE -- if pure function (no side effects, same input same output)
as $$
declare
...
begin
...
return my_return_value
end;
$$ language sql;
Or similar to the one presented in the book
drop function if exists max_price_per_region(int);
create function max_price_per_region(rid int, out numeric)
as $$
select max(price) from events where events.region_id=rid
-- implicity return the first row of this select
$$ language sql;
To other languages, such as python, it is necessary to run CREATE LANGUAGE [plpython3u]
that requires installing python itself along with Postgres.
count(1) over (partition by ...)
(you may want to add distinct
in after the select
)as example, the percentage of entries per year with max/min price per year.
select distinct
year,
round(100.0 * (count(1) over (partition by year)::numeric
/ (count(1) over())::numeric), 2) as percentage_of_entries,
(max(price) over (partition by year)::numeric
/ (count(1) over())::numeric) as max_price,
(min(price) over (partition by year)::numeric
/ (count(1) over())::numeric) as min_price
from events;
CREATE TEMPORARY TABLE my_temp_table_name
explain [analyze] my_query
to shows the execution plan of a query, it is important to check its cost
and its scans
cost
is a relative measurement, which you may want to minimize.sequential scan
, index scan
, …\d my_element
describes a element (table, view, index, …)junction table
is used to support relationship between two tables. The junction table contains the two foreign keys and creates its own primary key using those two foreign keys.where
(e.g.: index where a column is not null), and also concurrently
(without paying)tsrange
(timestamp range) in which you define a window (start, end) and inclusive/exclusive options.int4range
, numrange
, daterange
…@>
e.g: SELECT int4range(10, 30) @> 20
, to compute the intersection between two ranges use *
, and to check if the ranges overlaps, use &&