DB: SQL Postgres: TL;DR

01 May 2023, Carlos Pena

Install

Start Postgres DB with docker

$ docker run -d -p 5432:5432 -e POSTGRES_PASSWORD=pass \
    -v pgdata:/data/postgres -v user_data:/user_data docker.io/postgres

-- To Connect
$ psql postgres://postgres:pass@localhost:5432

Data Validation

drop domain if exists address_num_type cascade;
create domain address_num_type AS integer
check (VALUE > 0 and VALUE < 10000);

drop domain if exists cep_type cascade;
create domain cep_type AS TEXT
check(
    VALUE ~ '^\d{7,8}$'
    OR VALUE ~ '^\d{5}-\d{3}$'
);

drop table ADDRESS;
create table ADDRESS(
  id serial primary key,
  num address_num_type not null,
  cep cep_type not null
);

insert into ADDRESS values (default, 20, '50000000');
-- Done
insert into ADDRESS values (default, -1, '50000000');
-- SQL Error [23514]: ERROR: value for domain address_num_type violates check
-- constraint "address_num_type_check"
insert into ADDRESS values (default, 20, '500');
-- SQL Error [23514]: ERROR: value for domain cep_type violates check
-- constraint "cep_type_check"

On Conflict

Equivalent to MERGE.

CREATE TEMPORARY TABLE chcp_users (
    id SERIAL PRIMARY KEY,
    cpf TEXT UNIQUE,
    name text,
    age numeric
);


INSERT INTO chcp_users (cpf, name, age)
VALUES ('11122233345', 'Carlos Pena', 27)
ON CONFLICT (cpf)
DO UPDATE SET
    name = EXCLUDED.name,
    age = EXCLUDED.age;

select * from chcp_users
-- 1    11122233345 Carlos Pena 27

INSERT INTO chcp_users (cpf, name, age)
VALUES ('11122233345', 'Carlos H C Pena', 28)
ON CONFLICT (cpf)
DO UPDATE SET
    name = EXCLUDED.name,
    age = EXCLUDED.age;

select * from chcp_users
-- 1    11122233345 Carlos H C Pena 28


INSERT INTO chcp_users (cpf, name, age)
VALUES ('11122233345', 'AAAAAAAAA', -1)
ON CONFLICT (cpf)
DO nothing
-- Updated Rows: 0 (error has been suppressed)

select * from chcp_users
-- 1    11122233345 Carlos H C Pena 28

Numrange index

CREATE TABLE example_table (
    start_num NUMERIC,
    end_num NUMERIC,
    text_column text
);


CREATE OR REPLACE PROCEDURE populate_table(
    in_count INTEGER
)
LANGUAGE plpgsql
AS $$
DECLARE
    i INTEGER := 1;
    random_st INTEGER := 0;
    random_range INTEGER :=  0;
BEGIN
    WHILE i <= in_count loop
        random_st := random() * 1000;
        random_range :=  random() * 1000;

        INSERT INTO example_table (start_num, end_num, text_column)
        VALUES (random_st, random_st+random_range, 'Row ' || i);
        i := i + 1;
    END LOOP;
END;
$$;

CALL populate_table(2000000);


select * from example_table
where
    100 >= start_num and
    100 <= end_num;
-- COST 0.00 - 42739.00 -- Seq Scan
create index idx_example_table_start on example_table(start_num);
create index idx_example_table_end on example_table(end_num);
create index idx_example_table_start_end on example_table(start_num, end_num desc);


select * from example_table
where
    100 >= start_num and
    100 <= end_num;
-- COST 3798.16 - 19575.96 -- Bitmap Heap Scan (idx_example_table_start)

select * from example_table
where
    100 between start_num and end_num;
-- COST 3798.16 - 19575.96 -- Bitmap Heap Scan (idx_example_table_start)


create index idx_example_table_range_gist ON example_table using gist (numrange(start_num, end_num, '[]' ));
-- GiST (Generalized Search Tree)

select * from example_table
where
    numeric '100' <@ numrange(start_num, end_num, '[]');
-- COST 385.91 - 13086.03 -- Bitmap Heap Scan (idx_example_table_range_gist)

Trigger

-- Dummy table with some data
create temporary table example_table (
    alfa int,
    beta int
);

insert into example_table values(1, 2);
insert into example_table values(10, 30);
insert into example_table values(10, 10);
insert into example_table values(10, 0);
select * from example_table;
-- Add third column `gamma` that will hold the value of `alfa` + `beta`
alter table example_table
add column gamma int;

-- Create trigger to calculate `gamma` for new data
create or replace function calculate_gamma()
returns trigger as $$
begin
    new.gamma := new.alfa + new.beta;
    return new;
end;
$$ language plpgsql;

create trigger trigger_calculate_gamma
before insert on example_table
for each row
execute function calculate_gamma();

insert into example_table values(0, 0);
insert into example_table values(1, 1);
-- will work, but the old values will continue with null values of gamma

update example_table
set gamma = alfa + beta;
-- update all rows

Full-Text Search

create table if not exists documents (
    id SERIAL PRIMARY KEY,
    title TEXT,
    content TEXT,
    tsv_content tsvector null,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

CREATE INDEX idx_tsv_content ON documents USING gin(tsv_content);

-- Create a trigger to calculate tsv
create or replace function calc_tsv()
returns trigger as $$
begin
    new.tsv_content := to_tsvector('english', new.title || ' ' || new.content);
    return new;
end;
$$ language plpgsql;

create trigger trigger_calc_tsv
before insert or update on documents
for each row
execute function calc_tsv();
-- drop trigger trigger_calc_tsv on documents;

-- Sample from chatgpt :D
INSERT INTO documents (title, content)
VALUES
('PostgreSQL Full-Text Search Guide', 'This guide covers how to perform full-text search in PostgreSQL. It includes creating tsvectors, tsqueries, and indexing.'),
('Introduction to Database Indexing', 'Database indexing improves the speed of data retrieval operations on a table. This tutorial explains how indexes work in databases like PostgreSQL and MySQL.'),
('Advanced Search Techniques', 'Learn how to implement advanced search techniques in PostgreSQL. This includes phrase search, prefix search, and ranking results by relevance.'),
('What is SQL?', 'Structured Query Language, or SQL, is a standard programming language for managing relational databases. It is used to perform queries, update data, and more.'),
('PostgreSQL vs MySQL: A Comparison', 'In this article, we compare PostgreSQL and MySQL in terms of performance, scalability, and support for advanced features like full-text search.'),
('Building a REST API with FastAPI', 'This tutorial shows how to build a REST API using FastAPI, including endpoints for creating, reading, updating, and deleting data.'),
('Data Engineering Best Practices', 'Data engineering involves building systems for collecting, storing, and analyzing data. This article discusses the best practices in data pipeline design, scalability, and optimization.'),
('Python and SQL for Data Analysis', 'Learn how to combine Python with SQL for powerful data analysis. This tutorial covers using libraries like pandas and SQLAlchemy to query databases efficiently.'),
('Understanding GIN Indexes in PostgreSQL', 'GIN indexes are essential for efficient full-text search in PostgreSQL. This guide explains how to create and use GIN indexes to speed up searches.'),
('Optimizing Queries in PostgreSQL', 'Query optimization is key to improving database performance. This guide covers query planning, indexing strategies, and analyzing query performance.')
('Introduction to Machine Learning', 'Machine learning is a field of artificial intelligence that uses statistical techniques to give computer systems the ability to learn from data. This article covers the basic concepts.'),
('Supervised vs Unsupervised Learning', 'This document explains the key differences between supervised and unsupervised learning, with examples of common algorithms used in each approach.'),
('Deep Learning for Image Recognition', 'Deep learning has revolutionized image recognition, enabling machines to achieve superhuman accuracy. This article discusses popular models like Convolutional Neural Networks (CNNs).'),
('Building a Neural Network from Scratch', 'Learn how to build a neural network from scratch using Python. This tutorial walks through each step, including forward propagation, backpropagation, and gradient descent.'),
('Natural Language Processing with Machine Learning', 'This document covers natural language processing (NLP) techniques and how machine learning models, such as recurrent neural networks (RNNs) and transformers, are applied to text data.'),
('Implementing a Recommendation System', 'Recommendation systems use machine learning to provide personalized suggestions. This article explains collaborative filtering and content-based recommendation algorithms.'),
('Transfer Learning in Deep Learning', 'Transfer learning allows models to leverage pre-trained weights from other tasks to solve new problems. This article explains how transfer learning works and its applications.'),
('Reinforcement Learning: An Overview', 'Reinforcement learning is a type of machine learning where agents learn to make decisions by receiving rewards or penalties. This article explores Q-learning, policy gradients, and other RL techniques.'),
('Training a Deep Learning Model with TensorFlow', 'TensorFlow is a popular open-source framework for training deep learning models. This guide covers setting up the environment, defining models, and training them on datasets.'),
('Bias and Variance in Machine Learning Models', 'Understanding the tradeoff between bias and variance is critical for building accurate models. This article explains these concepts and how to address overfitting and underfitting.'),
('Hyperparameter Tuning for Machine Learning Models', 'Hyperparameter tuning can significantly improve model performance. Learn about techniques like grid search, random search, and Bayesian optimization to find the best model parameters.'),
('AI Ethics and Fairness in Machine Learning', 'As AI systems become more integrated into society, ensuring fairness and avoiding bias in machine learning models has become crucial. This article delves into the ethical challenges of AI development.');
-- Search for 'Machine' and 'System'
SELECT id, title, ts_rank(d.tsv_content, to_tsquery('Machine & System')) AS rank
FROM documents d
WHERE d.tsv_content @@ to_tsquery('Machine & System')
/*
    Introduction to Machine Learning
    Implementing a Recommendation System
    AI Ethics and Fairness in Machine Learning
*/

-- Search for 'Machine' and 'Learning' that are not together
SELECT * FROM documents
WHERE tsv_content @@ to_tsquery('english', 'machine & learning')
EXCEPT
SELECT * FROM documents
WHERE tsv_content @@ to_tsquery('english', 'machine <-> learning') -- 'Machine' Near to 'Learning'
/*
    1) Deep Learning for Image Recognition; Deep learning has revolutionized image recognition,
    enabling machines to achieve superhuman accuracy. This article discusses popular models
    like Convolutional Neural Networks (CNNs).
*/

-- Search for 'Deep' and 'learning' and NOT 'machine'
SELECT * FROM documents
WHERE tsv_content @@ to_tsquery('english', '!machine & learning & Deep')
/*
    Transfer Learning in Deep Learning
    Training a Deep Learning Model with TensorFlow
*/

-- Search for all words starting with "Postg*"
SELECT * FROM documents d
WHERE d.tsv_content @@ to_tsquery('Postg:*')
/*
    PostgreSQL Full-Text Search Guide
    PostgreSQL vs MySQL: A Comparison
    Understanding GIN Indexes in PostgreSQL
    Optimizing Queries in PostgreSQL
    ...
*/

Differential Updates

For this example we will create three product tables: prod_price, prod_stock, prod_info and they will be summarized in the prod_sumarização table.

-- Create Data/Structure
CREATE TABLE prod_price (
    product_id INT NOT NULL,
    store_id INT NOT NULL,
    price NUMERIC(10, 2),
    PRIMARY KEY (product_id, store_id)
);

INSERT INTO prod_price (product_id, store_id, price)
VALUES
    (1, 101, 19.99),
    (1, 102, 10.00),
    (1, 103, 20.00),
    (1, 104, NULL),
    (2, 101, NULL),
    (2, 102, 29.99),
    (2, 103, 29.99),
    (3, 103, 14.49),
    (4, 101, 9.99),
    (4, 999, 9.99),
    (5, 102, 39.99),
    (999, 102, 39.99),
    (999, 105, NULL);


CREATE TABLE prod_stock (
    product_id INT NOT NULL,
    store_id INT NOT NULL,
    stock INT NOT NULL,
    PRIMARY KEY (product_id, store_id)
);

INSERT INTO prod_stock (product_id, store_id, stock)
VALUES
    (1, 101, 50),
    (1, 102, 0),
    (1, 103, 3),
    (2, 101, 1),
    (2, 102, 30),
    (3, 103, 100),
    (3, 104, 1),
    (3, 105, 5),
    (4, 101, 10),
    (4, 888, 10),
    (5, 102, 0);
    (888, 102, 10);

CREATE TABLE if not exists prod_info(
    product_id INT PRIMARY KEY,
    title TEXT NOT NULL,
    tags TEXT[],
    description TEXT
);

INSERT INTO prod_info (product_id, title, tags, description)
VALUES
    (1, 'Wireless Mouse', ARRAY['electronics', 'peripherals'], 'Compact and ergonomic design.'),
    (2, 'Gaming Keyboard', ARRAY['electronics', 'gaming'], 'Mechanical keys with RGB lighting.'),
    (3, 'USB-C Cable', ARRAY['electronics', 'accessories'], 'Durable and fast charging.'),
    (4, 'Notebook Stand', ARRAY['office', 'ergonomics'], 'Adjustable aluminum stand for laptops.'),
    (5, 'Noise Cancelling Headphones', ARRAY['electronics', 'audio'], 'Over-ear headphones with ANC.');


CREATE TABLE prod_summarization (
    product_id INT NOT NULL,
    store_id INT NOT NULL,
    price NUMERIC(10, 2),
    stock INT,
    title TEXT,
    tags TEXT[],
    description TEXT,
    PRIMARY KEY (product_id, store_id)
);

-- Create a temporary table to help identify removed/added/changed items.
create table temp_prod_summarization as table prod_summarization with no data;
-- This temp table has no index/pk

delete from temp_prod_summarization;

-- Add data into a empty temp_prod_summarization
insert into temp_prod_summarization
    (product_id, store_id, price, stock, title, tags, description)
SELECT
    p.product_id,
    p.store_id,
    p.price,
    s.stock,
    i.title,
    i.tags,
    i.description
FROM prod_price p
JOIN prod_stock s
    ON p.product_id = s.product_id AND p.store_id = s.store_id
JOIN prod_info i
    ON p.product_id = i.product_id
where 1=1
    and s.stock > 0
    and p.price > 0  --and 'ergonomics'=any(i.tags)

-- Add/Update data
INSERT INTO prod_summarization (
    product_id, store_id, price, stock, title, tags, description
)
SELECT
    product_id, store_id, price, stock, title, tags, description
FROM temp_prod_summarization
ON CONFLICT (product_id, store_id)
DO UPDATE SET
    price = EXCLUDED.price,
    stock = EXCLUDED.stock,
    title = EXCLUDED.title,
    tags = EXCLUDED.tags,
    description = EXCLUDED.description
WHERE prod_summarization.price <> EXCLUDED.price
   OR prod_summarization.stock <> EXCLUDED.stock
   OR prod_summarization.title <> EXCLUDED.title
   OR prod_summarization.tags <> EXCLUDED.tags
   OR prod_summarization.description <> EXCLUDED.description;

-- Delete Data from prod_summarization
DELETE FROM prod_summarization
WHERE (product_id, store_id) NOT IN (
    SELECT product_id, store_id FROM temp_prod_summarization
);

-- Remember to keep temp_prod_summarization empty
delete from temp_prod_summarization;

-- TODO: Test this strategy with more data

TL;DR Of A Curious Moon

I recently read the PostgreSQL book/tutorial A Curious Moon. And I decided to do a TL;DR (similar to the previous post about Oracle SQL), although this time focused on the teachings of the book.

The TLDR as well as the code used can be found in the repository: tldr_a_curious_moon.

The idea of the post is to have a streamlined version of the teachings I had per chapter, focused on the Postgres itself. I found it interesting to change the database to avoid a simple copy/paste by myself. I chose a public dataset from the Brazilian government regarding confiscated cigarettes (and destroyed), that I may (or may not) change in the future.

A Curious Moon by Rob Conery

Before the book

In this section we will use docker-compose to run a local postgres instance, to start run:

docker compose up --remove-orphans [-d]

Transit

In Orbit

Flyby

A Bent Field

Sniff The Sky

csvsql my_csv_path -i postgresql –tables “import.my_name” –noconstraints –overwrite sed ‘s/VARCHAR/text/g’ > import.sql
with count_events as (select count(*) from events),
     count_regions as (select count(*) from regions)
select * from count_events, count_regions;

Ring Dust

drop function if exists my_function(arguments_types);
create function my_function (arguments)
returns return_datatype
IMMUTABLE -- if pure function (no side effects, same input same output)
as $$
   declare
      ...
   begin
      ...
      return my_return_value
   end;
$$ language sql;

Or similar to the one presented in the book

drop function if exists max_price_per_region(int);
create function max_price_per_region(rid int, out numeric)
as $$
    select max(price) from events where events.region_id=rid
    -- implicity return the first row of this select
$$ language sql;

To other languages, such as python, it is necessary to run CREATE LANGUAGE [plpython3u] that requires installing python itself along with Postgres.

as example, the percentage of entries per year with max/min price per year.

select distinct
    year,
    round(100.0 * (count(1) over (partition by year)::numeric
            / (count(1) over())::numeric), 2) as percentage_of_entries,
    (max(price) over (partition by year)::numeric
            / (count(1) over())::numeric) as max_price,
    (min(price) over (partition by year)::numeric
            / (count(1) over())::numeric) as min_price
from events;

A Tight Ship

Gravity Assist

Under The Ice

Glide Path and Epilog