Skip to content

PostgreSQL Advanced Features

This guide covers advanced PostgreSQL features available in Datablast SQL tasks, including JSON support, window functions, custom functions, and advanced data types.

PostgreSQL provides advanced features that enable complex data processing, analytics, and custom logic. These features include JSON support, window functions, custom functions, and advanced data types.

  • JSON Support: Native JSON and JSONB data types
  • Window Functions: Advanced analytical functions
  • Custom Functions: User-defined functions and procedures
  • Advanced Data Types: Arrays, ranges, and custom types
  • CTEs: Common Table Expressions for complex queries

PostgreSQL’s JSONB data type provides efficient storage and querying of JSON data.

-- Create table with JSONB column
CREATE TABLE user_sessions (
user_id VARCHAR,
session_id VARCHAR,
properties JSONB,
created_at TIMESTAMP DEFAULT NOW()
);
-- Query JSONB data
SELECT
user_id,
session_id,
properties->>'browser' as browser,
properties->>'device' as device,
properties->'location'->>'country' as country,
properties->'location'->>'city' as city
FROM user_sessions
WHERE created_at >= '{{ ds }}'::date
AND properties ? 'location'
-- Use JSONB operators
SELECT
user_id,
properties->>'browser' as browser, -- Get text value
properties->'tags' as tags, -- Get JSON value
properties->'tags'->0 as first_tag, -- Get array element
properties ? 'location' as has_location, -- Check key existence
properties @> '{"device": "mobile"}' as is_mobile -- Contains check
FROM user_sessions
WHERE created_at >= '{{ ds }}'::date
-- Use JSONB functions
SELECT
user_id,
jsonb_array_length(properties->'tags') as tag_count,
jsonb_object_keys(properties) as property_keys,
jsonb_pretty(properties) as formatted_properties
FROM user_sessions
WHERE created_at >= '{{ ds }}'::date
AND properties ? 'tags'
-- Aggregate JSON data
SELECT
user_id,
jsonb_agg(
jsonb_build_object(
'session_id', session_id,
'browser', properties->>'browser',
'device', properties->>'device'
)
) as session_summary
FROM user_sessions
WHERE created_at >= '{{ ds }}'::date
GROUP BY user_id
-- Build JSON objects
SELECT
user_id,
jsonb_build_object(
'total_sessions', COUNT(*),
'browsers', jsonb_agg(DISTINCT properties->>'browser'),
'devices', jsonb_agg(DISTINCT properties->>'device')
) as user_summary
FROM user_sessions
WHERE created_at >= '{{ ds }}'::date
GROUP BY user_id
-- Use ROW_NUMBER() for ranking
SELECT
user_id,
order_date,
revenue,
ROW_NUMBER() OVER (
PARTITION BY user_id
ORDER BY order_date DESC
) as order_rank
FROM orders
WHERE order_date >= '{{ ds }}'::date
-- Use RANK() and DENSE_RANK()
SELECT
user_id,
revenue,
RANK() OVER (ORDER BY revenue DESC) as revenue_rank,
DENSE_RANK() OVER (ORDER BY revenue DESC) as dense_revenue_rank
FROM user_totals
WHERE order_date >= '{{ ds }}'::date
-- Use LAG() and LEAD() for time-series analysis
SELECT
user_id,
order_date,
revenue,
LAG(revenue) OVER (
PARTITION BY user_id
ORDER BY order_date
) as prev_revenue,
LEAD(revenue) OVER (
PARTITION BY user_id
ORDER BY order_date
) as next_revenue
FROM orders
WHERE order_date >= '{{ ds }}'::date
-- Calculate running totals
SELECT
user_id,
order_date,
revenue,
SUM(revenue) OVER (
PARTITION BY user_id
ORDER BY order_date
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
) as running_total
FROM orders
WHERE order_date >= '{{ ds }}'::date
-- Calculate moving averages
SELECT
user_id,
order_date,
revenue,
AVG(revenue) OVER (
PARTITION BY user_id
ORDER BY order_date
ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
) as moving_avg_3
FROM orders
WHERE order_date >= '{{ ds }}'::date
-- Use percentile functions
SELECT
user_id,
revenue,
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY revenue) as median_revenue,
PERCENTILE_CONT(0.9) WITHIN GROUP (ORDER BY revenue) as p90_revenue
FROM user_totals
WHERE order_date >= '{{ ds }}'::date
-- Use ROWS frame
SELECT
user_id,
order_date,
revenue,
SUM(revenue) OVER (
PARTITION BY user_id
ORDER BY order_date
ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING
) as sum_3_rows
FROM orders
WHERE order_date >= '{{ ds }}'::date
-- Use RANGE frame
SELECT
user_id,
order_date,
revenue,
SUM(revenue) OVER (
PARTITION BY user_id
ORDER BY order_date
RANGE BETWEEN INTERVAL '1 day' PRECEDING AND INTERVAL '1 day' FOLLOWING
) as sum_3_days
FROM orders
WHERE order_date >= '{{ ds }}'::date
-- Use simple CTE
WITH user_totals AS (
SELECT
user_id,
SUM(revenue) as total_revenue,
COUNT(*) as order_count
FROM orders
WHERE order_date >= '{{ ds }}'::date
GROUP BY user_id
)
SELECT
user_id,
total_revenue,
order_count,
total_revenue / order_count as avg_order_value
FROM user_totals
WHERE total_revenue > 1000
-- Use multiple CTEs
WITH user_totals AS (
SELECT
user_id,
SUM(revenue) as total_revenue,
COUNT(*) as order_count
FROM orders
WHERE order_date >= '{{ ds }}'::date
GROUP BY user_id
),
user_ranks AS (
SELECT
user_id,
total_revenue,
RANK() OVER (ORDER BY total_revenue DESC) as revenue_rank
FROM user_totals
)
SELECT
u.user_id,
u.total_revenue,
u.revenue_rank,
CASE
WHEN u.revenue_rank <= 10 THEN 'Top 10'
WHEN u.revenue_rank <= 100 THEN 'Top 100'
ELSE 'Other'
END as user_tier
FROM user_ranks u
-- Use recursive CTE for hierarchical data
WITH RECURSIVE user_hierarchy AS (
-- Base case
SELECT
user_id,
parent_user_id,
user_name,
1 as level
FROM users
WHERE parent_user_id IS NULL
UNION ALL
-- Recursive case
SELECT
u.user_id,
u.parent_user_id,
u.user_name,
uh.level + 1
FROM users u
JOIN user_hierarchy uh ON u.parent_user_id = uh.user_id
)
SELECT
user_id,
user_name,
level
FROM user_hierarchy
ORDER BY level, user_id
-- Use recursive CTE for number generation
WITH RECURSIVE numbers AS (
SELECT 1 as n
UNION ALL
SELECT n + 1
FROM numbers
WHERE n < 100
)
SELECT n
FROM numbers
-- Create table with array column
CREATE TABLE user_tags (
user_id VARCHAR,
tags TEXT[],
created_at TIMESTAMP DEFAULT NOW()
);
-- Query array data
SELECT
user_id,
tags,
array_length(tags, 1) as tag_count,
tags[1] as first_tag,
'premium' = ANY(tags) as is_premium
FROM user_tags
WHERE created_at >= '{{ ds }}'::date
-- Use array functions
SELECT
user_id,
tags,
array_agg(DISTINCT unnest(tags)) as unique_tags,
array_length(tags, 1) as tag_count
FROM user_tags
WHERE created_at >= '{{ ds }}'::date
GROUP BY user_id, tags
-- Create table with range column
CREATE TABLE user_sessions (
user_id VARCHAR,
session_range TSRANGE,
created_at TIMESTAMP DEFAULT NOW()
);
-- Query range data
SELECT
user_id,
session_range,
lower(session_range) as session_start,
upper(session_range) as session_end,
session_range @> NOW() as is_active
FROM user_sessions
WHERE created_at >= '{{ ds }}'::date
-- Create custom type
CREATE TYPE user_status AS ENUM ('active', 'inactive', 'suspended');
-- Create table with custom type
CREATE TABLE users (
user_id VARCHAR,
email VARCHAR,
status user_status,
created_at TIMESTAMP DEFAULT NOW()
);
-- Query custom type
SELECT
user_id,
email,
status,
CASE status
WHEN 'active' THEN 'User is active'
WHEN 'inactive' THEN 'User is inactive'
WHEN 'suspended' THEN 'User is suspended'
END as status_description
FROM users
WHERE created_at >= '{{ ds }}'::date
-- Create function for data validation
CREATE OR REPLACE FUNCTION validate_email(email TEXT)
RETURNS BOOLEAN AS $$
BEGIN
RETURN email ~ '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$';
END;
$$ LANGUAGE plpgsql;
-- Use custom function
SELECT
user_id,
email,
validate_email(email) as is_valid_email
FROM users
WHERE created_at >= '{{ ds }}'::date
-- Create function with parameters
CREATE OR REPLACE FUNCTION calculate_user_score(
user_id TEXT,
start_date DATE,
end_date DATE
)
RETURNS DECIMAL AS $$
DECLARE
total_revenue DECIMAL;
order_count INTEGER;
user_score DECIMAL;
BEGIN
SELECT
COALESCE(SUM(revenue), 0),
COUNT(*)
INTO total_revenue, order_count
FROM orders
WHERE user_id = $1
AND order_date >= $2
AND order_date <= $3;
user_score := total_revenue * 0.1 + order_count * 0.01;
RETURN user_score;
END;
$$ LANGUAGE plpgsql;
-- Use function with parameters
SELECT
user_id,
calculate_user_score(
user_id,
'{{ ds }}'::date,
'{{ next_ds }}'::date
) as user_score
FROM users
WHERE created_at >= '{{ ds }}'::date
-- Create procedure for data processing
CREATE OR REPLACE PROCEDURE process_user_data(
process_date DATE
)
LANGUAGE plpgsql AS $$
BEGIN
-- Update user statistics
UPDATE users
SET
last_order_date = (
SELECT MAX(order_date)
FROM orders
WHERE user_id = users.user_id
AND order_date <= process_date
),
total_orders = (
SELECT COUNT(*)
FROM orders
WHERE user_id = users.user_id
AND order_date <= process_date
),
total_revenue = (
SELECT COALESCE(SUM(revenue), 0)
FROM orders
WHERE user_id = users.user_id
AND order_date <= process_date
)
WHERE created_at <= process_date;
-- Log processing completion
INSERT INTO processing_log (process_date, status, completed_at)
VALUES (process_date, 'completed', NOW());
END;
$$;
-- Call procedure
CALL process_user_data('{{ ds }}'::date);
-- Pivot data using CASE
SELECT
user_id,
SUM(CASE WHEN order_type = 'premium' THEN revenue ELSE 0 END) as premium_revenue,
SUM(CASE WHEN order_type = 'standard' THEN revenue ELSE 0 END) as standard_revenue,
SUM(CASE WHEN order_type = 'basic' THEN revenue ELSE 0 END) as basic_revenue
FROM orders
WHERE order_date >= '{{ ds }}'::date
GROUP BY user_id
-- Use FILTER clause for conditional aggregation
SELECT
user_id,
COUNT(*) as total_orders,
COUNT(*) FILTER (WHERE order_type = 'premium') as premium_orders,
SUM(revenue) FILTER (WHERE order_type = 'premium') as premium_revenue,
AVG(revenue) FILTER (WHERE order_type = 'premium') as avg_premium_revenue
FROM orders
WHERE order_date >= '{{ ds }}'::date
GROUP BY user_id
-- Use lateral join
SELECT
u.user_id,
u.email,
recent_orders.order_date,
recent_orders.revenue
FROM users u
CROSS JOIN LATERAL (
SELECT
order_date,
revenue
FROM orders o
WHERE o.user_id = u.user_id
AND o.order_date >= '{{ ds }}'::date
ORDER BY o.order_date DESC
LIMIT 3
) recent_orders
WHERE u.created_at >= '{{ ds }}'::date
-- Good: Use JSONB for storage
CREATE TABLE user_sessions (
user_id VARCHAR,
properties JSONB
);
-- Avoid: Use JSON for storage
CREATE TABLE user_sessions (
user_id VARCHAR,
properties JSON
);
-- Create GIN index for JSONB queries
CREATE INDEX idx_user_sessions_properties_gin
ON user_sessions USING GIN (properties);
-- Good: Use appropriate operators
SELECT properties->>'browser' as browser
FROM user_sessions
WHERE properties ? 'browser';
-- Avoid: Use wrong operators
SELECT properties->'browser' as browser
FROM user_sessions
WHERE properties ? 'browser';
-- Good: Use appropriate window frame
SELECT
user_id,
order_date,
revenue,
SUM(revenue) OVER (
PARTITION BY user_id
ORDER BY order_date
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
) as running_total
FROM orders
-- Good: Optimize window function
SELECT
user_id,
order_date,
revenue,
ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY order_date) as order_sequence
FROM orders
WHERE order_date >= '{{ ds }}'::date
-- Good: Use CTEs for complex logic
WITH user_totals AS (
SELECT
user_id,
SUM(revenue) as total_revenue
FROM orders
WHERE order_date >= '{{ ds }}'::date
GROUP BY user_id
)
SELECT
user_id,
total_revenue,
CASE
WHEN total_revenue > 1000 THEN 'High Value'
WHEN total_revenue > 100 THEN 'Medium Value'
ELSE 'Low Value'
END as user_tier
FROM user_totals
-- Good: Keep CTEs focused
WITH base_data AS (
SELECT user_id, order_date, revenue
FROM orders
WHERE order_date >= '{{ ds }}'::date
),
aggregated_data AS (
SELECT
user_id,
COUNT(*) as order_count,
SUM(revenue) as total_revenue
FROM base_data
GROUP BY user_id
)
SELECT * FROM aggregated_data

Problem: Slow JSON queries

Solutions:

  • Create GIN indexes on JSONB columns
  • Use appropriate JSON operators
  • Avoid complex JSON operations in WHERE clauses

Debug Steps:

  1. Check index usage
  2. Analyze query execution plan
  3. Review JSON operator usage
  4. Optimize JSON queries

Problem: Slow window functions

Solutions:

  • Optimize window frame specifications
  • Use appropriate partitioning
  • Avoid unnecessary window functions

Debug Steps:

  1. Analyze query execution plan
  2. Check window frame specifications
  3. Review partitioning strategy
  4. Optimize window function usage

Problem: Slow CTE queries

Solutions:

  • Optimize CTE logic
  • Use appropriate indexing
  • Avoid complex CTE operations

Debug Steps:

  1. Analyze CTE execution plan
  2. Check indexing strategy
  3. Review CTE logic
  4. Optimize CTE queries