SQL for pandas Users
DataStore compiles pandas-style operations into optimized SQL. This guide helps pandas users understand the SQL behind their operations.
Viewing Generated SQL
Output:
Basic Operations Mapping
Filtering (WHERE)
| pandas | SQL |
|---|---|
df[df['age'] > 25] | WHERE age > 25 |
df[df['city'] == 'NYC'] | WHERE city = 'NYC' |
df[(df['x'] > 10) & (df['y'] < 20)] | WHERE x > 10 AND y < 20 |
df[(df['a'] == 1) | (df['b'] == 2)] | WHERE a = 1 OR b = 2 |
df[~(df['status'] == 'inactive')] | WHERE NOT status = 'inactive' |
df[df['col'].isin([1, 2, 3])] | WHERE col IN (1, 2, 3) |
df[df['val'].between(10, 20)] | WHERE val BETWEEN 10 AND 20 |
df[df['name'].str.contains('John')] | WHERE position('John' IN name) > 0 |
Selection (SELECT)
| pandas | SQL |
|---|---|
df['col'] | SELECT col |
df[['a', 'b', 'c']] | SELECT a, b, c |
df.head(10) | LIMIT 10 |
df.tail(10) | Complex (ORDER BY ... DESC LIMIT 10) |
df.drop_duplicates() | SELECT DISTINCT * |
Sorting (ORDER BY)
| pandas | SQL |
|---|---|
df.sort_values('col') | ORDER BY col ASC |
df.sort_values('col', ascending=False) | ORDER BY col DESC |
df.sort_values(['a', 'b']) | ORDER BY a ASC, b ASC |
df.sort_values(['a', 'b'], ascending=[True, False]) | ORDER BY a ASC, b DESC |
df.nlargest(10, 'col') | ORDER BY col DESC LIMIT 10 |
df.nsmallest(5, 'col') | ORDER BY col ASC LIMIT 5 |
GroupBy and Aggregation
Basic GroupBy
| pandas | SQL |
|---|---|
df.groupby('city')['sales'].sum() | SELECT city, SUM(sales) FROM ... GROUP BY city |
df.groupby('city')['sales'].mean() | SELECT city, AVG(sales) FROM ... GROUP BY city |
df.groupby('city').size() | SELECT city, COUNT(*) FROM ... GROUP BY city |
df.groupby(['a', 'b'])['c'].sum() | SELECT a, b, SUM(c) FROM ... GROUP BY a, b |
Aggregation Functions
| pandas | SQL |
|---|---|
sum() | SUM() |
mean() | AVG() |
count() | COUNT() |
min() | MIN() |
max() | MAX() |
std() | stddevPop() |
var() | varPop() |
median() | MEDIAN() |
nunique() | COUNT(DISTINCT col) |
first() | any() |
last() | anyLast() |
Multiple Aggregations
HAVING Clause
Joins
| pandas | SQL |
|---|---|
pd.merge(df1, df2, on='id') | JOIN df2 ON df1.id = df2.id |
pd.merge(df1, df2, on='id', how='left') | LEFT JOIN df2 ON ... |
pd.merge(df1, df2, on='id', how='right') | RIGHT JOIN df2 ON ... |
pd.merge(df1, df2, on='id', how='outer') | FULL OUTER JOIN df2 ON ... |
pd.merge(df1, df2, left_on='a', right_on='b') | JOIN df2 ON df1.a = df2.b |
Join Example
String Operations
| pandas | SQL |
|---|---|
df['col'].str.upper() | upper(col) |
df['col'].str.lower() | lower(col) |
df['col'].str.len() | length(col) |
df['col'].str.strip() | trim(col) |
df['col'].str.contains('x') | position('x' IN col) > 0 |
df['col'].str.startswith('x') | startsWith(col, 'x') |
df['col'].str.endswith('x') | endsWith(col, 'x') |
df['col'].str.replace('a', 'b') | replace(col, 'a', 'b') |
df['col'].str[:5] | substring(col, 1, 5) |
DateTime Operations
| pandas | SQL |
|---|---|
df['date'].dt.year | toYear(date) |
df['date'].dt.month | toMonth(date) |
df['date'].dt.day | toDayOfMonth(date) |
df['date'].dt.hour | toHour(date) |
df['date'].dt.dayofweek | toDayOfWeek(date) |
df['date'].dt.quarter | toQuarter(date) |
Arithmetic Operations
| pandas | SQL |
|---|---|
df['a'] + df['b'] | a + b |
df['a'] - df['b'] | a - b |
df['a'] * df['b'] | a * b |
df['a'] / df['b'] | a / b |
df['a'] // df['b'] | intDiv(a, b) |
df['a'] % df['b'] | a % b |
df['a'] ** 2 | pow(a, 2) |
df['a'].abs() | abs(a) |
df['a'].round(2) | round(a, 2) |
NULL Handling
| pandas | SQL |
|---|---|
df['col'].isna() | isNull(col) |
df['col'].notna() | isNotNull(col) |
df.dropna() | WHERE col IS NOT NULL (for each col) |
df.fillna(0) | ifNull(col, 0) |
df.fillna({'a': 0, 'b': 'x'}) | ifNull(a, 0), ifNull(b, 'x') |
Complete Example
pandas Code
Equivalent SQL
DataStore Code
SQL Keywords Summary
| pandas Operation | SQL Clause |
|---|---|
df[condition] | WHERE |
df[['a', 'b']] | SELECT a, b |
df.groupby('x') | GROUP BY x |
.agg({'col': 'sum'}) | SUM(col) |
.sort_values('x') | ORDER BY x |
.head(n) | LIMIT n |
pd.merge() | JOIN |
.drop_duplicates() | DISTINCT |
.having() | HAVING |
Tips for pandas Users
1. Think in SQL Operations
When writing DataStore code, think about what SQL you'd want:
2. Use to_sql() to Learn
3. Leverage SQL Features
DataStore gives you SQL power with pandas syntax: