Перед вами стоит задача – подготовить аналитический отчет для HR-отдела. На основании проведенной аналитики предполагается составить рекомендации для отдела кадров по стратегии набора персонала, а также по взаимодействию с уже имеющимися сотрудниками.
В базе данных лежит набор таблиц, которые содержат данные о сотрудниках вымышленной компании.
Сделайте обзор штата сотрудников компании. Составьте набор предметов исследования, а затем проверьте их на данных. Вся аналитика должна быть выполена с помощью SQL. Впоследствии данные можно визуализировать, однако финальные датафреймы для графиков также должны быть подготовлены с помощью SQL.
Примеры гипотез:
perfomance score
и тем, под чьим руководством работает сотрудник.Оригинальный датасет на kaggle: Human Resources Data Set
1. Подготовка и настройка
1.1. Импорт библиотек
1.2. Настройка и проверка соединения с базой данных
1.3. Запросы и функции
1.3.1. Описание данных
1.3.2. Расчет корреляционной матрицы
1.3.3. Расчет сводной таблицы
1.3.4. Визуализация сводной таблицы
2. Таблица hr_dataset
2.1. Ознакомление с исходными данными
2.2. Подготовка датафрейма
2.3. Анализ корреляционных матриц
2.3.1. Матрица рассеяния
2.3.2. Коэффициент корреляции Пирсона
2.4. Анализ категориальных показателей
2.4.1. Расовый состав
2.4.2. Каналы поиска сотрудников
2.4.3. Качество каналов поиска сотрудников
3. Таблица production_staff
3.1. Ознакомление с исходными данными
3.2. Подготовка датафрейма
3.3. Анализ корреляционных матриц
3.4. Анализ категориальных показателей
4. Таблица recruiting_costs
4.1. Ознакомление с исходными данными
4.2. Подготовка датафрейма
4.3. Расходы на каналы поиска сотрудников
5. Таблица salary_grid
5.1. Ознакомление с исходными данными
5.2. Подготовка датафрейма
5.3. Выход зарплаты за границы диапазона
6. Поверка гипотез
6.1. Проверка гипотезы 1
6.2. Проверка гипотезы 2
7. Рекомендации для отдела кадров
7.1. Рекомендации по стратегии набора персонала
7.2. Рекомендации по взаимодействию с имеющимися сотрудниками
Перейти к началу аналитического отчета
Перейти к проверке гипотез
Перейти к рекомендациям
from contextlib import closing
import psycopg2
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# Database credentials
pg_connection = {
"host": "***",
"port": 1234,
"dbname": "human_resources",
"user": "readonly",
"password": "***"
}
def read_sql(sql, creds=pg_connection, **kwargs):
with closing(psycopg2.connect(**pg_connection)) as conn:
return pd.read_sql(sql, conn, **kwargs)
read_sql("SELECT * FROM information_schema.tables WHERE table_schema='public' ORDER BY table_name;")
# Список таблиц базы данных
table_catalog | table_schema | table_name | table_type | self_referencing_column_name | reference_generation | user_defined_type_catalog | user_defined_type_schema | user_defined_type_name | is_insertable_into | is_typed | commit_action | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | human_resources | public | abu_pay | VIEW | None | None | None | None | None | NO | NO | None |
1 | human_resources | public | abupay | VIEW | None | None | None | None | None | NO | NO | None |
2 | human_resources | public | abupay1 | VIEW | None | None | None | None | None | NO | NO | None |
3 | human_resources | public | contacts | BASE TABLE | None | None | None | None | None | YES | NO | None |
4 | human_resources | public | days_work | VIEW | None | None | None | None | None | NO | NO | None |
5 | human_resources | public | female | BASE TABLE | None | None | None | None | None | YES | NO | None |
6 | human_resources | public | hr_dataset | BASE TABLE | None | None | None | None | None | YES | NO | None |
7 | human_resources | public | hr_dataset_view | VIEW | None | None | None | None | None | YES | NO | None |
8 | human_resources | public | male | BASE TABLE | None | None | None | None | None | YES | NO | None |
9 | human_resources | public | numb_of_hire | BASE TABLE | None | None | None | None | None | YES | NO | None |
10 | human_resources | public | numb_of_terms | BASE TABLE | None | None | None | None | None | YES | NO | None |
11 | human_resources | public | production_staff | BASE TABLE | None | None | None | None | None | YES | NO | None |
12 | human_resources | public | recruiting_costs | BASE TABLE | None | None | None | None | None | YES | NO | None |
13 | human_resources | public | salary_grid | BASE TABLE | None | None | None | None | None | YES | NO | None |
14 | human_resources | public | share_employee_source | BASE TABLE | None | None | None | None | None | YES | NO | None |
15 | human_resources | public | show_functions | VIEW | None | None | None | None | None | NO | NO | None |
16 | human_resources | public | tmpDB | VIEW | None | None | None | None | None | NO | NO | None |
# Сформировать SQL запрос для получения названий колонок таблицы
def new_sql_column_names(table_name):
return f"""
SELECT column_name
FROM information_schema.columns
WHERE table_schema = 'public'
AND table_name = '{table_name}';
"""
# Сформировать SQL запрос для расчета статистики по указанной колонке таблицы
def new_sql_column_stats(table_name, column_name):
return f"""
WITH probabilities(p) AS (
SELECT count(*)::numeric / (SELECT count(*) FROM "{table_name}")
FROM "{table_name}"
GROUP BY "{column_name}"
)
, column_info(data_type) AS (
SELECT data_type
FROM information_schema.columns
WHERE table_schema = 'public'
AND table_name = '{table_name}'
AND column_name = '{column_name}'
)
, most_common(common_val, common_freq) AS (
SELECT "{column_name}"::text, count(*)::real / (SELECT count(*) FROM "{table_name}")
FROM "{table_name}"
GROUP BY "{column_name}"
ORDER BY count(*) DESC
LIMIT 1
)
SELECT
'{table_name}' AS table_name
, '{column_name}' AS column_name
, (SELECT data_type FROM column_info) AS data_type
, count(DISTINCT "{column_name}") AS unique
, (SELECT common_val FROM most_common) AS common_val
, (SELECT common_freq FROM most_common) AS common_freq
, count("{column_name}") AS non_null
, count(*) AS count
, sum(pg_column_size("{column_name}")) AS size_bytes
--1st brunch is an optimisation, i.e. this CASE statement is not necessary
, CASE
WHEN count(DISTINCT "{column_name}") = count(*) THEN --no duplicates in column
log(2.0, count(*)) --max entropy
ELSE
(SELECT sum(p * log(2.0, 1/p)) FROM probabilities)
END AS entropy
FROM "{table_name}"
"""
# Сформировать один запрос из нескольких, соединенных выражением UNION ALL
def new_sql_union_all(sql_queries, prefix='', postfix=''):
""" Prefix is well suited for WITH clause, postfix is for ORDER BY clause. """
return prefix + '\n(\n' + '\n) UNION ALL (\n'.join(sql_queries) + '\n)\n' + postfix
# Функция для описания данных в таблице table_name
# Получить датафрейм со следующей информацией:
#
# table_name Имя таблицы
# col_name Имя колонки
# data_type Тип данных
# unique Число уникальных значений
# common_val Самое частовстречающееся значение
# common_freq Доля приходящаяся на самое частовстречающееся значение (от 0.0 до 1.0)
# non_null Число значений, кроме NULL
# count Число значений, включая NULL
# size_bytes Размер данных в байтах (сумма размеров всех ячеек колонки)
# entropy Информационная энтропия данных в колонке (бит)
#
def get_column_stats(table_name):
column_names = read_sql(new_sql_column_names(table_name)).column_name
stats_queries = [new_sql_column_stats(table_name, column_name) for column_name in column_names]
sql_union_all_stats = new_sql_union_all(stats_queries)
return read_sql(sql_union_all_stats)
def new_sql_count_distinct(column_names, where=None, table_name=None, subquery=None):
assert ((table_name is None) + (subquery is None)) == 1, \
'Exactly one of the table_name and subquery parameters must be specified'
field_sep = '\n , '
return f"""
SELECT {field_sep.join(f'count(DISTINCT "{column_name}") AS "{column_name}"'
for column_name in column_names)}
FROM {f'"{table_name}"' if table_name is not None else f'({subquery}) AS subquery'}
WHERE {'TRUE' if where is None else where}
"""
def new_sql_corr_matrix(column_names, where=None, seed=None, table_name=None, subquery=None):
assert ((table_name is None) + (subquery is None)) == 1, \
'Exactly one of the table_name and subquery parameters must be specified'
prefix = '' if seed is None else f'SELECT setseed({seed});'
prefix += f"""
WITH tbl_with_rand_num AS (
SELECT random() AS "__RND__", *
FROM {f'"{table_name}"' if table_name is not None else f'({subquery}) AS subquery'}
WHERE {'TRUE' if where is None else where}
)
"""
def new_sql_select_corr_columns(x_column, y_columns):
field_sep = '\n , '
return f"""
SELECT {field_sep.join(f'corr("{x_column}", "{y_column}") AS "{y_column}"'
for y_column in y_columns)}
FROM tbl_with_rand_num
"""
return new_sql_union_all((new_sql_select_corr_columns(x_column, column_names)
for x_column in column_names), prefix=prefix)
def get_corr_matrix(column_names, where=None, corr_rnd=False, seed=None,
table_name=None, subquery=None, read_sql=read_sql):
""" Produce correlation matrix between specified columns.
Parameters
----------
corr_rnd : { True | False }, default False
Include correlations with random values
seed : float, default None
Use provided value in postgres setseed(value). Allowed values are floating-point
numbers between -1 and 1. If None – setseed() is skipped
...
"""
kwargs = dict(where=where, table_name=table_name, subquery=subquery)
unique_counts = read_sql(new_sql_count_distinct(column_names, **kwargs))
columns_names_non_constant = unique_counts.T[unique_counts.T > 1].dropna().index
corr_column_names = [*columns_names_non_constant, '__RND__'] if corr_rnd else columns_names_non_constant
corr_matrix = read_sql(new_sql_corr_matrix(corr_column_names, seed=seed, **kwargs))
corr_matrix.index = corr_column_names
return corr_matrix
# Сформировать SQL запрос для получения всех уникальных значений по колонке
def new_sql_distinct_values(field, table_name=None, subquery=None, order_by=None):
""" order_by : string, default None
Expression for ORDER BY clause. If None ORDER BY clause will be omitted.
Examples: 'column_name', 'column_name DESC', 'count(*)', 'avg("Pay Rate") DESC'
"""
assert ((table_name is None) + (subquery is None)) == 1, \
'Exactly one of the table_name and subquery parameters must be specified'
return f'''
SELECT "{field}"::text AS column_name
FROM {f'"{table_name}"' if table_name is not None else f'({subquery}) AS subquery'}
GROUP BY "{field}"
{f'ORDER BY {order_by}' if order_by is not None else ''}
'''
# Сформировать SQL запрос для получения сводной таблицы (pivot table)
def new_sql_pivot_table(rows_field, columns_field, column_names,
value_expr, total_row_expr, total_col_expr,
table_name=None, subquery=None,
order_by=None, total_row_name=None, total_col_name=None):
assert ((table_name is None) + (subquery is None)) == 1, \
'Exactly one of the table_name and subquery parameters must be specified'
if total_row_name is None:
total_row_name = 'total'
if total_col_name is None:
total_col_name = 'total'
# Сформировать SQL конструкцию JOIN (JOIN clause)
def new_sql_left_join_clause(column_name):
return f'''
LEFT JOIN (
SELECT row_name, value AS "{column_name}"
FROM values_with_total
WHERE column_name = '{column_name}'
) AS "column {column_name}"
USING (row_name)
'''
return f'''
WITH row_names AS (
(
SELECT "{rows_field}"::text AS row_name
FROM {f'"{table_name}"' if table_name is not None else f'({subquery}) AS subquery'}
GROUP BY "{rows_field}"
{f"ORDER BY {order_by}" if order_by is not None else ''}
)
UNION ALL
SELECT '{total_row_name}' AS row_name
)
, values(row_name, column_name, value) AS (
SELECT "{rows_field}"::text AS row_name, "{columns_field}"::text AS column_name, {value_expr} AS value
FROM {f'"{table_name}"' if table_name is not None else f'({subquery}) AS subquery'}
GROUP BY "{rows_field}", "{columns_field}"
)
, values_with_total AS (
SELECT *
FROM values
-- values for TOTAL row
UNION ALL
SELECT '{total_row_name}' AS row_name, column_name, {total_row_expr} AS value
FROM values
GROUP BY column_name
-- values for TOTAL column
UNION ALL
SELECT row_name, '{total_col_name}' AS column_name, {total_col_expr} AS value
FROM values
GROUP BY row_name
)
SELECT
row_name
, "{total_col_name}"
, {', '.join([f'COALESCE("{column_name}", 0) AS "{column_name}"' for column_name in column_names])}
-- 1st column: row names
FROM row_names
-- 2nd column: total
{new_sql_left_join_clause(total_col_name)}
-- N columns: 1 per column name
{''.join([new_sql_left_join_clause(column_name) for column_name in column_names])}
'''
# Получить датафрейм в виде сводной таблицы
def get_pivot_table_df(rows_field, columns_field, value_expr, total_row_expr, total_col_expr,
table_name=None, subquery=None,
total_row_name=None, total_col_name=None, order_columns=None, order_rows=None,
read_sql=read_sql, out_sql_string=None):
""" Query date from database formed as pivot table.
Parameters
----------
table_name : string
SQL identifier for table in database
rows_field : string
SQL identifier for column contains row names for pivot table
columns_field : string
SQL identifier for column contains column names for pivot table
value_expr : string
SQL expression for pivot table values.
Examples: 'count(*)', 'sum("Pay Rate")', 'avg("Pay Rate")'
total_row_expr : string
SQL expression for pivot table total row values.
Examples: 'sum(value)', 'avg(value)'
total_col_expr : string
SQL expression for pivot table total column values.
Examples: 'sum(value)', 'avg(value)'
table_name : string, default None
SQL identifier will be used as table name in FROM clause.
Exactly one of the table_name and subquery parameters must be specified
subquery : string, default None
SQL expression will be used as subquery in FROM clause.
Exactly one of the table_name and subquery parameters must be specified
total_row_name : string, default None
Name for total row of the pivot table. If None default name is used: 'total'
total_col_name : string, default None
Name for total column of the pivot table. If None default name is used: 'total'
order_columns : {None, 'name', 'name DESC', 'total', 'total DESC'}, string, default None
* None: skip ordering. Order undefined
* name: order by column name
* name DESC: order by column name descending
* total: order by column's total value
* total DESC: order by column's total value descending
* string: any other string will be used as a raw SQL expression in OREDER BY clause
order_rows : {None, 'name', 'name DESC', 'total', 'total DESC'}, default None
* None: skip ordering. Order undefined
* name: order by row name
* name DESC: order by row name descending
* total: order by row's total value
* total DESC: order by row's total value descending
* string: any other string will be used as a raw SQL expression in OREDER BY clause
read_sql : function, default read_sql
Function querying data from database. Should receive SQL string as first parameter as well as
following keyword arguments: params, index_col
out_sql_string : list, default None
Append string with SQL query to provided list. Useful for debugging purposes
"""
order_by_expr = {'name': '1', 'name DESC': '1 DESC',
'total': value_expr, 'total DESC': f'{value_expr} DESC'}
# Query 1. Query column names
sql_distinct_values = new_sql_distinct_values(columns_field, table_name=table_name,
subquery=subquery,
order_by=order_by_expr.get(order_columns, order_columns))
df_column_names = read_sql(sql_distinct_values)
column_names = df_column_names.column_name
# Query 2. Query data that represents pivot table
sql_pivot_table = new_sql_pivot_table(
rows_field, columns_field, column_names, table_name=table_name, subquery=subquery,
value_expr=value_expr,
total_row_expr=total_row_expr, total_col_expr=total_col_expr,
total_row_name=total_row_name, total_col_name=total_col_name,
order_by=order_by_expr.get(order_rows, order_rows))
if out_sql_string is not None:
out_sql_string.append(sql_pivot_table)
df = read_sql(sql_pivot_table, index_col='row_name')
df.index.name = rows_field
df.columns.name = columns_field
return df
def plot_pivot_table(df, cmap='Greens', title='', fmt='.2g'):
plt.figure(figsize=(15, 15))
vmin, vmax = df.iloc[:-1, 2:].min().min(), df.iloc[:-1, 2:].max().max()
ax = sns.heatmap(df, cmap=cmap, vmin=vmin, vmax=vmax,
annot=True, linecolor=(0, 0, 0, .01), lw=1, cbar=False, square=True, fmt=fmt)
ncols = int(ax.get_xbound()[1])
for artist in ax.get_children():
if isinstance(artist, matplotlib.collections.QuadMesh):
for rgba in artist.get_facecolor()[::ncols]:
rgba[3] = 0
for rgba in artist.get_facecolor()[-ncols:]:
rgba[3] = 0
break
for text in ax.texts[::ncols]:
text.set_fontweight('bold')
text.set_color('k')
for text in ax.texts[-ncols+1:]: # +1 to compensate the first NULL in the total row
text.set_fontweight('bold')
text.set_color('k')
for text in ax.texts:
if text.get_text() in ['0', '0.0', '0.00']:
text.set_alpha(0)
plt.xticks(rotation=45, ha='right')
plt.title(title, size=24, pad=24)
# bugfix: https://github.com/mwaskom/seaborn/issues/1773
bottom, top = plt.ylim()
plt.ylim(bottom + 0.5, top - 0.5)
hr_dataset
¶Сотрудники компании
Предпросмотр исходных данных
hr_dataset_raw_preview = read_sql("SELECT * FROM hr_dataset LIMIT 5;")
hr_dataset_raw_preview.T
# Датафрейм транспонирован для удобства просмотра
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
id | 1 | 2 | 3 | 4 | 5 |
Employee Name | Brown, Mia | LaRotonda, William | Steans, Tyrone | Howard, Estelle | Singh, Nan |
Employee Number | 1103024456 | 1106026572 | 1302053333 | 1211050782 | 1307059817 |
marriedid | 1 | 0 | 0 | 1 | 0 |
maritalstatusid | 1 | 2 | 0 | 1 | 0 |
genderid | 0 | 1 | 1 | 0 | 0 |
empstatus_id | 1 | 1 | 1 | 1 | 1 |
deptid | 1 | 1 | 1 | 1 | 1 |
perf_scoreid | 3 | 3 | 3 | 9 | 9 |
age | 30 | 34 | 31 | 32 | 30 |
Pay Rate | 28.5 | 23 | 29 | 21.5 | 16.56 |
state | MA | MA | MA | MA | MA |
zip | 1450 | 1460 | 2703 | 2170 | 2330 |
dob | 1987-11-24 | 1984-04-26 | 1986-09-01 | 1985-09-16 | 1988-05-19 |
sex | Female | Male | Male | Female | Female |
maritaldesc | Married | Divorced | Single | Married | Single |
citizendesc | US Citizen | US Citizen | US Citizen | US Citizen | US Citizen |
Hispanic/Latino | No | No | No | No | No |
racedesc | Black or African American | Black or African American | White | White | White |
Date of Hire | 2008-10-27 | 2014-01-06 | 2014-09-29 | 2015-02-16 | 2015-05-01 |
Days Employed | 3317 | 1420 | 1154 | 58 | 940 |
Date of Termination | None | None | None | 2015-04-15 | None |
Reason For Term | N/A - still employed | N/A - still employed | N/A - still employed | N/A - still employed | N/A - still employed |
Employment Status | Active | Active | Active | Active | Active |
department | Admin Offices | Admin Offices | Admin Offices | Admin Offices | Admin Offices |
position | Accountant I | Accountant I | Accountant I | Administrative Assistant | Administrative Assistant |
Manager Name | Brandon R. LeBlanc | Brandon R. LeBlanc | Brandon R. LeBlanc | Brandon R. LeBlanc | Brandon R. LeBlanc |
Employee Source | Diversity Job Fair | Website Banner Ads | Internet Search | Pay Per Click - Google | Website Banner Ads |
Performance Score | Fully Meets | Fully Meets | Fully Meets | N/A- too early to review | N/A- too early to review |
Описание данных
get_column_stats('hr_dataset')
# Характеристики каждой колонки входного датафрейма
# table_name Имя таблицы
# col_name Имя колонки
# data_type Тип данных
# unique Число уникальных значений
# common_val Самое частовстречающееся значение
# common_freq Доля приходящаяся на самое частовстречающееся значение (от 0.0 до 1.0)
# non_null Число значений, кроме NULL
# count Число значений, включая NULL
# size_bytes Размер данных в байтах (сумма размеров всех ячеек колонки)
# entropy Информационная энтропия данных в колонке (бит)
table_name | column_name | data_type | unique | common_val | common_freq | non_null | count | size_bytes | entropy | |
---|---|---|---|---|---|---|---|---|---|---|
0 | hr_dataset | id | integer | 310 | 184 | 0.003226 | 310 | 310 | 1240 | 8.276124 |
1 | hr_dataset | Employee Name | character varying | 310 | Martins, Joseph | 0.003226 | 310 | 310 | 4961 | 8.276124 |
2 | hr_dataset | Employee Number | integer | 309 | 1204033041 | 0.006452 | 310 | 310 | 1240 | 8.269673 |
3 | hr_dataset | marriedid | integer | 2 | 0 | 0.603226 | 310 | 310 | 1240 | 0.969032 |
4 | hr_dataset | maritalstatusid | integer | 5 | 0 | 0.441935 | 310 | 310 | 1240 | 1.693589 |
5 | hr_dataset | genderid | integer | 2 | 0 | 0.570968 | 310 | 310 | 1240 | 0.985419 |
6 | hr_dataset | empstatus_id | integer | 5 | 1 | 0.590323 | 310 | 310 | 1240 | 1.539148 |
7 | hr_dataset | deptid | integer | 6 | 5 | 0.670968 | 310 | 310 | 1240 | 1.489341 |
8 | hr_dataset | perf_scoreid | integer | 7 | 3 | 0.583871 | 310 | 310 | 1240 | 1.972673 |
9 | hr_dataset | age | integer | 39 | 31 | 0.080645 | 310 | 310 | 1240 | 4.883800 |
10 | hr_dataset | Pay Rate | real | 94 | 55 | 0.080645 | 310 | 310 | 1240 | 5.572627 |
11 | hr_dataset | state | character | 28 | MA | 0.887097 | 310 | 310 | 930 | 1.015902 |
12 | hr_dataset | zip | integer | 158 | 1886 | 0.041935 | 310 | 310 | 1240 | 6.943452 |
13 | hr_dataset | dob | date | 306 | 1984-07-07 | 0.006452 | 310 | 310 | 1240 | 8.250318 |
14 | hr_dataset | sex | character varying | 2 | Female | 0.570968 | 310 | 310 | 1904 | 0.985419 |
15 | hr_dataset | maritaldesc | character varying | 5 | Single | 0.441935 | 310 | 310 | 2397 | 1.693589 |
16 | hr_dataset | citizendesc | character varying | 3 | US Citizen | 0.948387 | 310 | 310 | 3522 | 0.335082 |
17 | hr_dataset | Hispanic/Latino | character varying | 4 | No | 0.906452 | 310 | 310 | 958 | 0.488525 |
18 | hr_dataset | racedesc | character varying | 6 | White | 0.622581 | 310 | 310 | 3336 | 1.624991 |
19 | hr_dataset | Date of Hire | date | 99 | 2011-01-10 | 0.045161 | 310 | 310 | 1240 | 6.023224 |
20 | hr_dataset | Days Employed | integer | 181 | 972 | 0.035484 | 310 | 310 | 1240 | 7.021000 |
21 | hr_dataset | Date of Termination | date | 93 | None | 0.667742 | 103 | 310 | 412 | 3.074344 |
22 | hr_dataset | Reason For Term | character varying | 17 | N/A - still employed | 0.635484 | 310 | 310 | 5785 | 2.290359 |
23 | hr_dataset | Employment Status | character varying | 5 | Active | 0.590323 | 310 | 310 | 3980 | 1.539148 |
24 | hr_dataset | department | character varying | 6 | Production | 0.670968 | 310 | 310 | 4597 | 1.489341 |
25 | hr_dataset | position | character varying | 28 | Production Technician I | 0.438710 | 310 | 310 | 6810 | 2.973388 |
26 | hr_dataset | Manager Name | character varying | 21 | Michael Albert | 0.070968 | 310 | 310 | 4235 | 4.195093 |
27 | hr_dataset | Employee Source | character varying | 23 | Employee Referral | 0.100000 | 310 | 310 | 5660 | 4.169512 |
28 | hr_dataset | Performance Score | character varying | 7 | Fully Meets | 0.583871 | 310 | 310 | 4138 | 1.972673 |
Описание полей таблицы hr_dataset
Field | Type | Description |
---|---|---|
id | Integer | ID строки |
Employee Name | Text | Фамилия, имя сотрудника, разделенные запятой. Пример: Howard, Estelle |
Employee Number | Integer | Уникальный номер сотрудника |
genderid | Integer | ID значения поля sex |
sex | Text | Пол: Male | Female |
marriedid | Integer | Состоит в браке (1) / не состоит (0) |
maritalstatusid | Integer | ID значения поля maritaldesc |
maritaldesc | Text | Семейное положение: Widowed | Married | Separated | Single | Divorced |
empstatus_id | Integer | ID значения поля Employment Status |
Employment Status | Text | Статус трудовых отношений: |
- Future Start – Испытательный срок | ||
- Voluntarily Terminated – Уволен по собственному желанию | ||
- Active – Действующий сотрудник | ||
- Terminated for Cause – Уволен по какой-либо причине | ||
- Leave of Absence – Отпуск | ||
deptid | Integer | ID значения поля department |
department | Text | Название отдела: Production | Admin Offices | IT/IS | Executive Office | Software Engineering | Sales |
perf_scoreid | Integer | ID значения поля Performance Score |
Performance Score | Text | Оценка производительности: |
- N/A- too early to review – Недостаточный срок для оценки | ||
- Exceptional – Исключительная | ||
- Exceeds – Превосходная | ||
- Fully Meets – Удовлетворительная | ||
- 90-day meets – Прошел испытательный срок | ||
- Needs Improvement – Неудовлетворительная | ||
- PIP – Performance Improvement Plan / План индивидуального развития | ||
Pay Rate | Real | Тарифная ставка |
state | Text | Штат проживания |
zip | Integer | Почтовый индекс |
dob | Date | Date of Birth / Дата рождения |
age | Integer | Возраст |
citizendesc | Text | Статус гражданина: |
- Eligible NonCitizen – Законно находится на территории США, не является гражданином | ||
- Non-Citizen – Не является гражданином США | ||
- US Citizen – Гражданин США | ||
Hispanic/Latino | Text | Является латиноамериканцем: No | no | Yes | yes |
racedesc | Text | Расовая принадлежность: |
- American Indian or Alaska Native – Индеец или уроженец Аляски | ||
- Hispanic – Латиноамериканец | ||
- Two or more races – Две и более расы | ||
- Asian – Азиат | ||
- White – Европеец | ||
- Black or African American – Темнокожий или афроамериканец | ||
Date of Hire | Date | Дата приема на работу |
Days Employed | Integer | Срок работы |
Date of Termination | Date | Дата увольнения |
Reason For Term | Text | Причина увольнения: |
- N/A - still employed – Не уволен | ||
- N/A - Has not started yet – Еще не приступил к работе | ||
- attendance – Посещаемость | ||
- more money – Недостаточная зарплата | ||
- medical issues – Проблемы со здоровьем | ||
- maternity leave - did not return – Декретный отпуск - не вернулся | ||
- return to school – Вернулся в школу | ||
- relocation out of area – Переезд | ||
- performance – Производительность труда | ||
- gross misconduct – Грубое нарушение | ||
- military – Воинская обязанность | ||
- hours – Неудобные рабочие часы | ||
- career change – Смена карьеры | ||
- unhappy – Неудовлетворенность | ||
- retiring – Уход на пенсию | ||
- Another position – Смена позиции в компании | ||
- no-call, no-show – Не отвечал на звонки, не явился на работу | ||
position | Text | Позиция в компании. Пример: Production Technician I |
Manager Name | Text | Имя менеджера. Пример: Michael Albert |
Employee Source | Text | Канал поиска соискателей. Пример: Employee Referral |
План подготовки:
Yes | yes
→ 1
, No | no
→ 0
.Employment Status
→ emp_status
).Подробный план подготовки датафрейма hr_dataset
id
– идентификаторEmployee Name
– идентификатор, ключевое поле для соединения с таблицей production_staff
Employee Number
– идентификаторgenderid
– дублирует sex
maritalstatusid
– дублирует maritaldesc
empstatus_id
– дублирует Employment Status
deptid
– дублирует department
perf_scoreid
– дублирует Performance Score
dob
(дата рождения) – частично дублируется информацией из поля age
maritaldesc
– переименовать в marital_desc
Employment Status
– переименовать в emp_status
department
state
zip
– привести к строковому типуcitizendesc
– переименовать в citizen_desc
racedesc
– переименовать в race_desc
Reason For Term
– переименовать в term_reason
position
Manager Name
– переименовать в manager_name
Employee Source
– переименовать в emp_source
Performance Score
– переименовать в perf_desc
sex
– переименовать в gender_desc
genderid
– переименовать в is_male
marriedid
– переименовать в is_married
Hispanic/Latino
– переименовать в is_latino
Date of Termination
is_termd
is_termd_cause
= 1
если сотрудник уволен не по собственному желаниюPerformance Score
Fully Meets
→ 0
, Exceeds
→ 1
, Exceptional
→ 2
, ...)perf_score
perf_level
= { low | average | high }
(perf_score { < | = | > } 0
)Pay Rate
– переименовать в pay_rate
age
Days Employed
– переименовать в days_emp
Date of Hire
– переименовать в hire_date
Согласно плану подготовки датафрейма, составляется SQL запрос.
# Запрос с подготовкой данных
SQL_HR_DATASET_SUBQUERY = """
WITH hr_dataset__subq_1 AS (
SELECT
--categorical
"maritaldesc" AS marital_desc
, "Employment Status" AS emp_status
, "department" AS department
, "state" AS state
, "zip"::text AS zip
, "citizendesc" AS citizen_desc
, "racedesc" AS race_desc
, "Reason For Term" AS term_reason
, "position" AS position
, "Manager Name" AS manager_name
, "Employee Source" AS emp_source
, "Performance Score" AS perf_desc
, "sex" AS gender_desc
--binary
, ("sex" IN ('Male'))::int AS is_male
, "marriedid" AS is_married
, "Hispanic/Latino"::boolean::int AS is_latino
, ("Date of Termination" IS NOT NULL)::int AS is_termd
, ("Employment Status" = 'Terminated for Cause')::int AS is_termd_cause
--ordinal
, (CASE
WHEN "Performance Score" = 'Exceptional' THEN 2
WHEN "Performance Score" = 'Exceeds' THEN 1
WHEN "Performance Score" = 'Needs Improvement' THEN -1
WHEN "Performance Score" = 'PIP' THEN -2
ELSE 0
END) AS perf_score
--numeric
, "Pay Rate" AS pay_rate
, "age" AS age
, "Days Employed" AS days_emp
--datetime
, "Date of Hire" AS hire_date
FROM hr_dataset
)
SELECT
*
, (CASE
WHEN perf_score < 0 THEN 'low'
WHEN perf_score = 0 THEN 'average'
WHEN perf_score > 0 THEN 'high'
END) AS perf_level
FROM hr_dataset__subq_1
"""
hr_dataset_preview = read_sql(f"SELECT * FROM ({SQL_HR_DATASET_SUBQUERY}) t LIMIT 5")
hr_dataset_preview.T
# Пример подготовленных (refined) данных в виде датафрейма
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
marital_desc | Married | Divorced | Single | Married | Single |
emp_status | Active | Active | Active | Active | Active |
department | Admin Offices | Admin Offices | Admin Offices | Admin Offices | Admin Offices |
state | MA | MA | MA | MA | MA |
zip | 1450 | 1460 | 2703 | 2170 | 2330 |
citizen_desc | US Citizen | US Citizen | US Citizen | US Citizen | US Citizen |
race_desc | Black or African American | Black or African American | White | White | White |
term_reason | N/A - still employed | N/A - still employed | N/A - still employed | N/A - still employed | N/A - still employed |
position | Accountant I | Accountant I | Accountant I | Administrative Assistant | Administrative Assistant |
manager_name | Brandon R. LeBlanc | Brandon R. LeBlanc | Brandon R. LeBlanc | Brandon R. LeBlanc | Brandon R. LeBlanc |
emp_source | Diversity Job Fair | Website Banner Ads | Internet Search | Pay Per Click - Google | Website Banner Ads |
perf_desc | Fully Meets | Fully Meets | Fully Meets | N/A- too early to review | N/A- too early to review |
gender_desc | Female | Male | Male | Female | Female |
is_male | 0 | 1 | 1 | 0 | 0 |
is_married | 1 | 0 | 0 | 1 | 0 |
is_latino | 0 | 0 | 0 | 0 | 0 |
is_termd | 0 | 0 | 0 | 1 | 0 |
is_termd_cause | 0 | 0 | 0 | 0 | 0 |
perf_score | 0 | 0 | 0 | 0 | 0 |
pay_rate | 28.5 | 23 | 29 | 21.5 | 16.56 |
age | 30 | 34 | 31 | 32 | 30 |
days_emp | 3317 | 1420 | 1154 | 58 | 940 |
hire_date | 2008-10-27 | 2014-01-06 | 2014-09-29 | 2015-02-16 | 2015-05-01 |
perf_level | average | average | average | average | average |
Scatter matrix (матрица рассеяния) подходит для визуального поиска корреляций среди числовых (numeric) показателей.
hr_numerical_features = read_sql(f"SELECT perf_score, pay_rate, age, days_emp FROM ({SQL_HR_DATASET_SUBQUERY}) hr")
pd.plotting.scatter_matrix(hr_numerical_features, figsize=(15, 15))
pass
Наблюдения:
Можно использовать для числовых (numeric) и бинарных ([0, 1]) показателей.
Действующие сотрудники
corr_columns = ['is_male', 'is_married', 'is_latino', 'is_termd', 'is_termd_cause',
'perf_score', 'pay_rate', 'age', 'days_emp']
df_corr_non_termd = get_corr_matrix(corr_columns, where='is_termd = 0',
corr_rnd=True, seed=0, subquery=SQL_HR_DATASET_SUBQUERY)
df_corr_non_termd.style.background_gradient(cmap='coolwarm', axis=None).format('{:.2f}')
# Замечание: если градиентная раскраска матрицы не отображается после перезагрузки страницы,
# нужно заново выполнить текущую ячейку (Ctrl+Enter).
is_male | is_married | is_latino | perf_score | pay_rate | age | days_emp | __RND__ | |
---|---|---|---|---|---|---|---|---|
is_male | 1.00 | -0.03 | 0.02 | -0.03 | 0.13 | 0.01 | -0.07 | 0.05 |
is_married | -0.03 | 1.00 | -0.04 | -0.11 | 0.04 | -0.04 | -0.03 | 0.02 |
is_latino | 0.02 | -0.04 | 1.00 | -0.08 | 0.03 | -0.07 | 0.04 | 0.01 |
perf_score | -0.03 | -0.11 | -0.08 | 1.00 | -0.01 | 0.02 | 0.06 | -0.05 |
pay_rate | 0.13 | 0.04 | 0.03 | -0.01 | 1.00 | 0.02 | -0.07 | 0.04 |
age | 0.01 | -0.04 | -0.07 | 0.02 | 0.02 | 1.00 | 0.05 | -0.04 |
days_emp | -0.07 | -0.03 | 0.04 | 0.06 | -0.07 | 0.05 | 1.00 | -0.07 |
__RND__ | 0.05 | 0.02 | 0.01 | -0.05 | 0.04 | -0.04 | -0.07 | 1.00 |
Пояснения:
__RND__
содержит коэффициент корреляции соответствующего показателя со случайным числом.__RND__
, следует рассматривать как недостоверные (случайная корреляция).Наблюдения:
pay_rate
$\uparrow\uparrow$ is_male
(0.13): ставка выше у сотрудников мужского пола.is_married
$\uparrow\mathrel{\mspace{-1mu}}\downarrow$ perf_score
(-0.11): у сотрудников состоящих в браке оценка труда в среднем ниже.perf_score
$\parallel$ Pay Rate
(-0.01): размер ставки не связан с оценкой производительности.age
, days_emp
: возраст и срок работы не имеют значительной корреляции с другими показателями.Уволенные и уволившиеся сотрудники
df_corr_termd = get_corr_matrix(corr_columns, where='is_termd = 1',
corr_rnd=True, seed=0, subquery=SQL_HR_DATASET_SUBQUERY)
df_corr_termd.style.background_gradient(cmap='coolwarm', axis=None).format('{:.2f}')
is_male | is_married | is_latino | is_termd_cause | perf_score | pay_rate | age | days_emp | __RND__ | |
---|---|---|---|---|---|---|---|---|---|
is_male | 1.00 | -0.01 | 0.02 | 0.01 | -0.06 | 0.16 | -0.04 | 0.03 | -0.01 |
is_married | -0.01 | 1.00 | -0.14 | -0.07 | 0.02 | 0.04 | -0.00 | 0.00 | -0.06 |
is_latino | 0.02 | -0.14 | 1.00 | -0.02 | 0.11 | 0.01 | -0.00 | -0.04 | 0.02 |
is_termd_cause | 0.01 | -0.07 | -0.02 | 1.00 | -0.20 | 0.19 | -0.11 | -0.03 | 0.03 |
perf_score | -0.06 | 0.02 | 0.11 | -0.20 | 1.00 | 0.06 | 0.10 | -0.05 | 0.01 |
pay_rate | 0.16 | 0.04 | 0.01 | 0.19 | 0.06 | 1.00 | 0.09 | -0.03 | 0.19 |
age | -0.04 | -0.00 | -0.00 | -0.11 | 0.10 | 0.09 | 1.00 | 0.13 | 0.10 |
days_emp | 0.03 | 0.00 | -0.04 | -0.03 | -0.05 | -0.03 | 0.13 | 1.00 | 0.03 |
__RND__ | -0.01 | -0.06 | 0.02 | 0.03 | 0.01 | 0.19 | 0.10 | 0.03 | 1.00 |
Наблюдения:
perf_score
$\uparrow\mathrel{\mspace{-1mu}}\downarrow$ is_termd_cause
(-0.20): увольнения по собственному желанию чаще происходили среди сотрудников с более высокой оценкой труда.pay_rate
$\uparrow\uparrow$ is_termd_cause
(0.19): у сотрудников уволенных за нарушение ставка в среднем выше, чем у сотрудников уволившихся по собственному желанию.pay_rate
с is_termd_cause
такая же (0.19) как со случайной величиной (__RND__
). Это значит, что не следует воспринимать это наблюдение как закономерность.age
$\uparrow\mathrel{\mspace{-1mu}}\downarrow$ is_termd_cause
(-0.11): увольнения за нарушения чаще происходили среди более молодых сотрудников.Сотрудники уволившиеся по собственному желанию
df_corr_termd_no_cause = get_corr_matrix(corr_columns, where='is_termd = 1 AND is_termd_cause = 0',
corr_rnd=True, seed=0, subquery=SQL_HR_DATASET_SUBQUERY)
df_corr_termd_no_cause.style.background_gradient(cmap='coolwarm', axis=None).format('{:.2f}')
is_male | is_married | is_latino | perf_score | pay_rate | age | days_emp | __RND__ | |
---|---|---|---|---|---|---|---|---|
is_male | 1.00 | -0.00 | 0.05 | -0.08 | 0.21 | -0.07 | 0.04 | 0.19 |
is_married | -0.00 | 1.00 | -0.13 | -0.03 | 0.05 | 0.02 | -0.09 | 0.18 |
is_latino | 0.05 | -0.13 | 1.00 | 0.10 | -0.01 | 0.02 | -0.02 | 0.09 |
perf_score | -0.08 | -0.03 | 0.10 | 1.00 | 0.03 | 0.08 | -0.06 | -0.04 |
pay_rate | 0.21 | 0.05 | -0.01 | 0.03 | 1.00 | 0.04 | -0.00 | 0.10 |
age | -0.07 | 0.02 | 0.02 | 0.08 | 0.04 | 1.00 | 0.14 | 0.01 |
days_emp | 0.04 | -0.09 | -0.02 | -0.06 | -0.00 | 0.14 | 1.00 | -0.17 |
__RND__ | 0.19 | 0.18 | 0.09 | -0.04 | 0.10 | 0.01 | -0.17 | 1.00 |
Наблюдения:
is_male
$\uparrow\uparrow$ pay_rate
(0.21): увольнения по собственному желанию сотрудников с высокой ставкой, чаще происходили среди сотрудников мужского пола.Сотрудники уволенные за нарушение
df_corr_termd_cause = get_corr_matrix(corr_columns, where='is_termd = 1 AND is_termd_cause = 1',
corr_rnd=True, seed=0, subquery=SQL_HR_DATASET_SUBQUERY)
df_corr_termd_cause.style.background_gradient(cmap='coolwarm', axis=None).format('{:.2f}')
is_male | is_married | is_latino | perf_score | pay_rate | age | days_emp | __RND__ | |
---|---|---|---|---|---|---|---|---|
is_male | 1.00 | -0.04 | -0.24 | 0.03 | -0.04 | 0.26 | -0.06 | -0.04 |
is_married | -0.04 | 1.00 | -0.21 | 0.16 | 0.08 | -0.31 | 0.60 | 0.34 |
is_latino | -0.24 | -0.21 | 1.00 | 0.14 | 0.17 | -0.21 | -0.12 | 0.35 |
perf_score | 0.03 | 0.16 | 0.14 | 1.00 | 0.32 | 0.12 | -0.07 | 0.36 |
pay_rate | -0.04 | 0.08 | 0.17 | 0.32 | 1.00 | 0.60 | -0.12 | 0.41 |
age | 0.26 | -0.31 | -0.21 | 0.12 | 0.60 | 1.00 | 0.02 | -0.01 |
days_emp | -0.06 | 0.60 | -0.12 | -0.07 | -0.12 | 0.02 | 1.00 | 0.17 |
__RND__ | -0.04 | 0.34 | 0.35 | 0.36 | 0.41 | -0.01 | 0.17 | 1.00 |
Наблюдения:
pay_rate
$\uparrow\uparrow$ perf_score
(0.32): у сотрудников с более высокой ставкой уволенных за нарушение, оценка производительности выше.age
$\uparrow\uparrow$ is_male
(0.26): сотрудники мужского пола уволенные за нарушение, в стреднем старше.age
$\uparrow\mathrel{\mspace{-1mu}}\downarrow$ is_married
(-0.31): сотрудники состоящие в браке уволенные за нарушение, в стреднем моложе.age
$\uparrow\uparrow$ pay_rate
(0.60): сотрудники с более высокой ставкой уволенные за нарушение, в стреднем старше.days_emp
$\uparrow\uparrow$ is_married
(0.60): сотрудники с большим сроком работы уволенные за нарушение, чаще состоят в браке.__RND__
являются индикатором дефицита данных (14 строк в выборке).sql_query_dump = []
position_race_count = get_pivot_table_df('position', 'race_desc',
'count(*) / (sum(count(*)) OVER ()) * 100', total_row_expr='sum(value)', total_col_expr='sum(value)',
subquery=SQL_HR_DATASET_SUBQUERY, order_columns='total DESC', order_rows='total', out_sql_string=sql_query_dump)
print(sql_query_dump[0])
# Демонстрация динамически сформированного SQL запроса
WITH row_names AS ( ( SELECT "position"::text AS row_name FROM ( WITH hr_dataset__subq_1 AS ( SELECT --categorical "maritaldesc" AS marital_desc , "Employment Status" AS emp_status , "department" AS department , "state" AS state , "zip"::text AS zip , "citizendesc" AS citizen_desc , "racedesc" AS race_desc , "Reason For Term" AS term_reason , "position" AS position , "Manager Name" AS manager_name , "Employee Source" AS emp_source , "Performance Score" AS perf_desc , "sex" AS gender_desc --binary , ("sex" IN ('Male'))::int AS is_male , "marriedid" AS is_married , "Hispanic/Latino"::boolean::int AS is_latino , ("Date of Termination" IS NOT NULL)::int AS is_termd , ("Employment Status" = 'Terminated for Cause')::int AS is_termd_cause --ordinal , (CASE WHEN "Performance Score" = 'Exceptional' THEN 2 WHEN "Performance Score" = 'Exceeds' THEN 1 WHEN "Performance Score" = 'Needs Improvement' THEN -1 WHEN "Performance Score" = 'PIP' THEN -2 ELSE 0 END) AS perf_score --numeric , "Pay Rate" AS pay_rate , "age" AS age , "Days Employed" AS days_emp --datetime , "Date of Hire" AS hire_date FROM hr_dataset ) SELECT * , (CASE WHEN perf_score < 0 THEN 'low' WHEN perf_score = 0 THEN 'average' WHEN perf_score > 0 THEN 'high' END) AS perf_level FROM hr_dataset__subq_1 ) AS subquery GROUP BY "position" ORDER BY count(*) / (sum(count(*)) OVER ()) * 100 ) UNION ALL SELECT 'total' AS row_name ) , values(row_name, column_name, value) AS ( SELECT "position"::text AS row_name, "race_desc"::text AS column_name, count(*) / (sum(count(*)) OVER ()) * 100 AS value FROM ( WITH hr_dataset__subq_1 AS ( SELECT --categorical "maritaldesc" AS marital_desc , "Employment Status" AS emp_status , "department" AS department , "state" AS state , "zip"::text AS zip , "citizendesc" AS citizen_desc , "racedesc" AS race_desc , "Reason For Term" AS term_reason , "position" AS position , "Manager Name" AS manager_name , "Employee Source" AS emp_source , "Performance Score" AS perf_desc , "sex" AS gender_desc --binary , ("sex" IN ('Male'))::int AS is_male , "marriedid" AS is_married , "Hispanic/Latino"::boolean::int AS is_latino , ("Date of Termination" IS NOT NULL)::int AS is_termd , ("Employment Status" = 'Terminated for Cause')::int AS is_termd_cause --ordinal , (CASE WHEN "Performance Score" = 'Exceptional' THEN 2 WHEN "Performance Score" = 'Exceeds' THEN 1 WHEN "Performance Score" = 'Needs Improvement' THEN -1 WHEN "Performance Score" = 'PIP' THEN -2 ELSE 0 END) AS perf_score --numeric , "Pay Rate" AS pay_rate , "age" AS age , "Days Employed" AS days_emp --datetime , "Date of Hire" AS hire_date FROM hr_dataset ) SELECT * , (CASE WHEN perf_score < 0 THEN 'low' WHEN perf_score = 0 THEN 'average' WHEN perf_score > 0 THEN 'high' END) AS perf_level FROM hr_dataset__subq_1 ) AS subquery GROUP BY "position", "race_desc" ) , values_with_total AS ( SELECT * FROM values -- values for TOTAL row UNION ALL SELECT 'total' AS row_name, column_name, sum(value) AS value FROM values GROUP BY column_name -- values for TOTAL column UNION ALL SELECT row_name, 'total' AS column_name, sum(value) AS value FROM values GROUP BY row_name ) SELECT row_name , "total" , COALESCE("White", 0) AS "White", COALESCE("Black or African American", 0) AS "Black or African American", COALESCE("Asian", 0) AS "Asian", COALESCE("Two or more races", 0) AS "Two or more races", COALESCE("Hispanic", 0) AS "Hispanic", COALESCE("American Indian or Alaska Native", 0) AS "American Indian or Alaska Native" -- 1st column: row names FROM row_names -- 2nd column: total LEFT JOIN ( SELECT row_name, value AS "total" FROM values_with_total WHERE column_name = 'total' ) AS "column total" USING (row_name) -- N columns: 1 per column name LEFT JOIN ( SELECT row_name, value AS "White" FROM values_with_total WHERE column_name = 'White' ) AS "column White" USING (row_name) LEFT JOIN ( SELECT row_name, value AS "Black or African American" FROM values_with_total WHERE column_name = 'Black or African American' ) AS "column Black or African American" USING (row_name) LEFT JOIN ( SELECT row_name, value AS "Asian" FROM values_with_total WHERE column_name = 'Asian' ) AS "column Asian" USING (row_name) LEFT JOIN ( SELECT row_name, value AS "Two or more races" FROM values_with_total WHERE column_name = 'Two or more races' ) AS "column Two or more races" USING (row_name) LEFT JOIN ( SELECT row_name, value AS "Hispanic" FROM values_with_total WHERE column_name = 'Hispanic' ) AS "column Hispanic" USING (row_name) LEFT JOIN ( SELECT row_name, value AS "American Indian or Alaska Native" FROM values_with_total WHERE column_name = 'American Indian or Alaska Native' ) AS "column American Indian or Alaska Native" USING (row_name)
position_race_count
# Демонстрация датафрейма полученного с помощью SQL запроса
race_desc | total | White | Black or African American | Asian | Two or more races | Hispanic | American Indian or Alaska Native |
---|---|---|---|---|---|---|---|
position | |||||||
IT Manager - Infra | 0.322581 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.322581 | 0.000000 |
Director of Sales | 0.322581 | 0.322581 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
CIO | 0.322581 | 0.322581 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
Director of Operations | 0.322581 | 0.000000 | 0.322581 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
IT Manager - Support | 0.322581 | 0.322581 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
Data Architect | 0.322581 | 0.000000 | 0.322581 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
President & CEO | 0.322581 | 0.322581 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
BI Director | 0.322581 | 0.322581 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
IT Director | 0.322581 | 0.000000 | 0.322581 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
Software Engineering Manager | 0.322581 | 0.322581 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
IT Manager - DB | 0.645161 | 0.322581 | 0.000000 | 0.000000 | 0.322581 | 0.000000 | 0.000000 |
Sr. Accountant | 0.645161 | 0.322581 | 0.000000 | 0.322581 | 0.000000 | 0.000000 | 0.000000 |
Shared Services Manager | 0.645161 | 0.322581 | 0.322581 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
Administrative Assistant | 0.967742 | 0.645161 | 0.000000 | 0.322581 | 0.000000 | 0.000000 | 0.000000 |
Senior BI Developer | 0.967742 | 0.000000 | 0.000000 | 0.967742 | 0.000000 | 0.000000 | 0.000000 |
Sales Manager | 0.967742 | 0.645161 | 0.322581 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
Accountant I | 0.967742 | 0.322581 | 0.645161 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
IT Support | 1.290323 | 0.645161 | 0.322581 | 0.000000 | 0.322581 | 0.000000 | 0.000000 |
Sr. DBA | 1.290323 | 0.645161 | 0.645161 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
BI Developer | 1.290323 | 0.645161 | 0.645161 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
Sr. Network Engineer | 1.612903 | 0.967742 | 0.000000 | 0.645161 | 0.000000 | 0.000000 | 0.000000 |
Software Engineer | 2.903226 | 1.290323 | 0.967742 | 0.645161 | 0.000000 | 0.000000 | 0.000000 |
Network Engineer | 2.903226 | 2.903226 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
Database Administrator | 4.193548 | 2.580645 | 0.967742 | 0.645161 | 0.000000 | 0.000000 | 0.000000 |
Production Manager | 4.516129 | 2.903226 | 0.645161 | 0.645161 | 0.000000 | 0.322581 | 0.000000 |
Area Sales Manager | 8.709677 | 3.548387 | 2.580645 | 0.322581 | 1.935484 | 0.000000 | 0.322581 |
Production Technician II | 18.387097 | 12.580645 | 2.903226 | 1.290323 | 0.645161 | 0.322581 | 0.645161 |
Production Technician I | 43.870968 | 29.032258 | 6.451613 | 5.161290 | 2.580645 | 0.322581 | 0.322581 |
total | NaN | 62.258065 | 18.387097 | 10.967742 | 5.806452 | 1.290323 | 1.290323 |
plot_pivot_table(position_race_count,
title='Расовый состав персонала, (%)',
cmap='pink_r', fmt='.1f')
plt.xlabel('Раса')
plt.ylabel('Должность')
pass
column_order = ['White', 'Black or African American', 'Asian', 'Hispanic',
'American Indian or Alaska Native', 'Two or more races']
position__race__pay_rate = get_pivot_table_df('position', 'race_desc',
'avg(pay_rate)', total_row_expr='avg(value)', total_col_expr='avg(value)',
subquery=SQL_HR_DATASET_SUBQUERY,
order_columns=f"array_position(ARRAY{column_order}, race_desc::text)", order_rows='total')
plot_pivot_table(position__race__pay_rate,
title='Средний размер ставки сотрудников',
cmap='pink_r', fmt='.1f')
plt.xlabel('Раса')
plt.ylabel('Должность')
pass
Наблюдения:
Production Technician I/II
).Software Engineer / Database Administrator / Sr. Network Engineer
.Accountant I / IT Support / Sales Manager / Sr. DBA
.Production Technician I/II
).Accountant I / Production Manager / BI Developer
.Production Technician I/II
.Production Technician I/II
).Database Administrator / Software Engineer / Sr. Network Engineer
.position__emp_source__count = get_pivot_table_df('position', 'Employee Source',
'count(*) / (sum(count(*)) OVER ()) * 100', total_row_expr='sum(value)', total_col_expr='sum(value)',
table_name='hr_dataset', order_columns='total DESC', order_rows='total')
plot_pivot_table(position__emp_source__count,
title='Эффективность каналов поиска сотрудников, (%)',
cmap='Oranges', fmt='.1f')
Наблюдения:
Employee Referral
/ рекомендации от сотрудников, Vendor Referral
/ рекомендации от поставщиков, Word of Mouth
/ из уст в уста). Этот метод находится в числе наиболее эффективных способов поиска востребованных в компании специалистовDiversity Job Fair
) выгодно отличается разнообразием (diversity) специалистовIndeed
) является единственным каналом поиска редких специалистов следующих направлений:perf_score__emp_source__count = get_pivot_table_df('perf_score', 'emp_source',
'count(*)',
total_row_expr="sum(value * (row_name::int > 0)::int - value * (row_name::int < 0)::int)",
total_col_expr="count(value)",
total_row_name='SOURCE SCORE', total_col_name='COUNT',
subquery=SQL_HR_DATASET_SUBQUERY + 'order by perf_score',
order_columns='total DESC', order_rows="perf_score DESC")
perf_score_map = {'2': 'Exceptional', '1': 'Exceeds', '0': 'Other', '-1': 'Needs Improvement', '-2': 'PIP'}
plot_pivot_table(perf_score__emp_source__count.rename(index=perf_score_map),
title='Качество каналов поиска сотрудников',
cmap='Blues', fmt='.0f')
plt.yticks(rotation=0)
plt.xlabel('Employee Source')
plt.ylabel('Performance')
pass
Описание метрики:
SOURCE SCORE
– оценка качества канала поиска сотрудников. Вычисляется как разница между числом оценок выше нуля (Exceptional или Exceeds) и числом оценок ниже нуля (Needs Improvement или PIP)Наблюдения:
SOURCE SCORE
:SOURCE SCORE
:production_staff
¶Сотрудники отдела production
Предпросмотр исходных данных
prod_staff_preview = read_sql("SELECT * FROM production_staff LIMIT 5;")
prod_staff_preview.T
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
id | 1 | 2 | 3 | 4 | 5 |
Employee Name | Albert, Michael | Bozzi, Charles | Butler, Webster L | Dunn, Amy | Gray, Elijiah |
Race Desc | White | Asian | White | White | White |
Date of Hire | 2011-08-01 | 2013-09-30 | 2016-01-28 | 2014-09-18 | 2015-06-02 |
TermDate | None | 2014-08-07 | None | None | None |
Reason for Term | N/A - still employed | retiring | N/A - still employed | N/A - still employed | N/A - still employed |
Employment Status | Active | Voluntarily Terminated | Active | Active | Active |
Department | Production | Production | Production | Production | Production |
Position | Production Manager | Production Manager | Production Manager | Production Manager | Production Manager |
Pay | $54.50 | $50.50 | $55.00 | $51.00 | $54.00 |
Manager Name | Elisa Bramante | Elisa Bramante | Elisa Bramante | Elisa Bramante | Elisa Bramante |
Performance Score | Fully Meets | Fully Meets | Exceeds | Fully Meets | Fully Meets |
Abutments/Hour Wk 1 | 0 | 0 | 0 | 0 | 0 |
Abutments/Hour Wk 2 | 0 | 0 | 0 | 0 | 0 |
Daily Error Rate | 0 | 0 | 0 | 0 | 0 |
90-day Complaints | 0 | 0 | 0 | 0 | 0 |
Описание данных
get_column_stats('production_staff')
table_name | column_name | data_type | unique | common_val | common_freq | non_null | count | size_bytes | entropy | |
---|---|---|---|---|---|---|---|---|---|---|
0 | production_staff | id | integer | 256 | 184 | 0.003906 | 256 | 256 | 1024 | 8.000000 |
1 | production_staff | Employee Name | character varying | 209 | None | 0.183594 | 209 | 256 | 3394 | 6.980212 |
2 | production_staff | Race Desc | character varying | 6 | White | 0.546875 | 209 | 256 | 2084 | 1.931321 |
3 | production_staff | Date of Hire | date | 70 | None | 0.183594 | 209 | 256 | 836 | 5.284105 |
4 | production_staff | TermDate | date | 76 | None | 0.675781 | 83 | 256 | 332 | 2.921130 |
5 | production_staff | Reason for Term | character varying | 17 | N/A - still employed | 0.460938 | 209 | 256 | 3771 | 2.764453 |
6 | production_staff | Employment Status | character varying | 5 | Active | 0.417969 | 209 | 256 | 2933 | 2.001493 |
7 | production_staff | Department | character varying | 1 | Production | 0.816406 | 209 | 256 | 3762 | 0.687876 |
8 | production_staff | Position | character varying | 3 | Production Technician I | 0.535156 | 209 | 256 | 5004 | 1.646242 |
9 | production_staff | Pay | money | 44 | None | 0.183594 | 209 | 256 | 1672 | 4.459380 |
10 | production_staff | Manager Name | character varying | 10 | None | 0.183594 | 209 | 256 | 2899 | 3.391169 |
11 | production_staff | Performance Score | character varying | 7 | Fully Meets | 0.476562 | 209 | 256 | 2773 | 2.290690 |
12 | production_staff | Abutments/Hour Wk 1 | integer | 18 | None | 0.187500 | 208 | 256 | 832 | 3.891610 |
13 | production_staff | Abutments/Hour Wk 2 | integer | 20 | None | 0.187500 | 208 | 256 | 832 | 3.861694 |
14 | production_staff | Daily Error Rate | integer | 5 | 0 | 0.375000 | 208 | 256 | 832 | 2.192483 |
15 | production_staff | 90-day Complaints | integer | 5 | 0 | 0.640625 | 208 | 256 | 832 | 1.533686 |
Описание полей таблицы production_staff
Field | Type | Description |
---|---|---|
id | Integer | ID строки |
Employee Name | Text | Фамилия, имя работника, разделенные запятой. Пример: Howard, Estelle |
Race Desc | Text | Расовая принадлежность: |
- American Indian or Alaska Native – Индеец или уроженец Аляски | ||
- Hispanic – Латиноамериканец | ||
- Two or more races – Две и более расы | ||
- Asian – Азиат | ||
- White – Европеец | ||
- Black or African American – Темнокожий или афроамериканец | ||
Date of Hire | Date | Дата приема на работу |
TermDate | Date | Дата увольнения |
Reason for Term | Text | Причина увольнения: |
- N/A - still employed – Не уволен | ||
- N/A - Has not started yet – Еще не приступил к работе | ||
- attendance – Посещаемость | ||
- more money – Недостаточная зарплата | ||
- medical issues – Проблемы со здоровьем | ||
- maternity leave - did not return – Декретный отпуск - не вернулся | ||
- return to school – Вернулся в школу | ||
- relocation out of area – Переезд | ||
- performance – Производительность труда | ||
- gross misconduct – Грубое нарушение | ||
- military – Воинская обязанность | ||
- hours – Неудобные рабочие часы | ||
- career change – Смена карьеры | ||
- unhappy – Неудовлетворенность | ||
- retiring – Уход на пенсию | ||
- Another position – Смена позиции в компании | ||
- no-call, no-show – Не отвечал на звонки, не явился на работу | ||
Employment Status | Text | Статус трудовых отношений: |
- Future Start – Испытательный срок | ||
- Voluntarily Terminated – Уволен по собственному желанию | ||
- Active – Действующий сотрудник | ||
- Terminated for Cause – Уволен по какой-либо причине | ||
- Leave of Absence – Отпуск | ||
Department | Text | Название отдела: None | Production |
Position | Text | Название должности: Production Technician I | None | Production Manager | Production Technician II |
Pay | Numeric | Размер тарифной ставки |
Manager Name | Text | Имя менеджера. Пример: Michael Albert |
Performance Score | Text | Оценка труда: |
- N/A- too early to review – Недостаточный срок для оценки | ||
- Exceptional – Исключительная | ||
- Exceeds – Превосходная | ||
- Fully Meets – Удовлетворительная | ||
- 90-day meets – Прошел испытательный срок | ||
- Needs Improvement – Неудовлетворительная | ||
- PIP – Performance Improvement Plan / План индивидуального развития | ||
Abutments/Hour Wk 1 | Integer | Опор* в час, на 1-й неделе. |
Abutments/Hour Wk 2 | Integer | Опор* в час, на 2-й неделе. |
Daily Error Rate | Integer | Ошибок в день |
90-day Complaints | Integer | Жалоб за последние 90 дней |
* Abutment (опора) – название детали для мостовидного зубного протеза
# Подготовка датафрейма production_staff
SQL_PROD_STAFF_SUBQUERY = """
SELECT
--categorical
"Employment Status" AS emp_status
, "Department" AS department
, "Race Desc" AS race_desc
, "Reason for Term" AS term_reason
, "Position" AS position
, "Manager Name" AS manager_name
, "Performance Score" AS perf_desc
--binary
, ("TermDate" IS NOT NULL)::int AS is_termd
, ("Employment Status" = 'Terminated for Cause')::int AS is_termd_cause
--ordinal
, (CASE
WHEN "Performance Score" = 'Exceptional' THEN 2
WHEN "Performance Score" = 'Exceeds' THEN 1
WHEN "Performance Score" = 'Needs Improvement' THEN -1
WHEN "Performance Score" = 'PIP' THEN -2
ELSE 0
END) AS perf_score
--quantitative
, "Pay"::decimal AS pay_rate
, ("Abutments/Hour Wk 1" + "Abutments/Hour Wk 2") / 2 AS prod_rate
, "Daily Error Rate" AS err_rate
, "90-day Complaints" AS cmpln_rate
--datetime
, "Date of Hire" AS hire_date
--key
, id
FROM production_staff
WHERE "Employee Name" IS NOT NULL
"""
Описание добавленных полей:
Abutments/Hour Wk 1
и Abutments/Hour Wk 2
.'Exceptional' = 2
'Exceeds' = 1
'Needs Improvement' = -1
'PIP' = -2
Other = 0
prod_numerical_features = read_sql(f"""
SELECT perf_score, pay_rate, prod_rate, err_rate, cmpln_rate
FROM ({SQL_PROD_STAFF_SUBQUERY}) prod
WHERE position IN ('Production Technician I', 'Production Technician II')
""")
pd.plotting.scatter_matrix(prod_numerical_features, figsize=(15, 15))
pass
Наблюдения:
err_rate
$\uparrow\mathrel{\mspace{-1mu}}\downarrow$ prod_rate
: обратно пропорциональная зависимость между числом ошибок и темпом производства.cmpln_rate
$\uparrow\mathrel{\mspace{-1mu}}\downarrow$ prod_rate
: обратно пропорциональная зависимость между числом жалоб и темпом производства.perf_score
$\uparrow\uparrow$ prod_rate
: прямо пропорциональная зависимость между оценкой производительности труда и темпом производства.Действующие сотрудники
corr_columns = ['is_termd', 'is_termd_cause', 'perf_score', 'pay_rate',
'prod_rate', 'err_rate', 'cmpln_rate']
prod_corr_non_termd = get_corr_matrix(corr_columns, subquery=SQL_PROD_STAFF_SUBQUERY,
where="is_termd = 0 AND position IN ('Production Technician I', 'Production Technician II')",
corr_rnd=True, seed=0)
prod_corr_non_termd.style.background_gradient(cmap='coolwarm', axis=None).format('{:.2f}')
perf_score | pay_rate | prod_rate | err_rate | cmpln_rate | __RND__ | |
---|---|---|---|---|---|---|
perf_score | 1.00 | 0.01 | 0.33 | -0.22 | -0.20 | 0.25 |
pay_rate | 0.01 | 1.00 | -0.08 | 0.05 | 0.12 | -0.02 |
prod_rate | 0.33 | -0.08 | 1.00 | -0.60 | -0.43 | 0.09 |
err_rate | -0.22 | 0.05 | -0.60 | 1.00 | 0.46 | -0.16 |
cmpln_rate | -0.20 | 0.12 | -0.43 | 0.46 | 1.00 | -0.17 |
__RND__ | 0.25 | -0.02 | 0.09 | -0.16 | -0.17 | 1.00 |
Наблюдения:
pay_rate
: ставка не имеет значительной корреляции с другими показателями.perf_score
$\uparrow\uparrow$ prod_rate
(0.33): у сотрудников с более высокой оценкой производительности, темп производства выше.prod_rate
$\uparrow\mathrel{\mspace{-1mu}}\downarrow$ err_rate
(-0.60): сотрудники с высоким темпом производства, имеют меньше ошибок.prod_rate
$\uparrow\mathrel{\mspace{-1mu}}\downarrow$ cmpln_rate
(-0.43): сотрудники с высоким темпом производства, имеют меньше жалоб.err_rate
$\uparrow\uparrow$ cmpln_rate
(0.46): сотрудники с бОльшим количеством ошибок, имеют больше жалоб.Уволенные и уволившиеся сотрудники
prod_corr_non_termd = get_corr_matrix(corr_columns, subquery=SQL_PROD_STAFF_SUBQUERY,
where="is_termd = 1 AND position IN ('Production Technician I', 'Production Technician II')",
corr_rnd=True, seed=0)
prod_corr_non_termd.style.background_gradient(cmap='coolwarm', axis=None).format('{:.2f}')
is_termd_cause | perf_score | pay_rate | prod_rate | err_rate | cmpln_rate | __RND__ | |
---|---|---|---|---|---|---|---|
is_termd_cause | 1.00 | -0.25 | -0.30 | -0.26 | 0.15 | 0.20 | 0.01 |
perf_score | -0.25 | 1.00 | 0.06 | 0.17 | -0.28 | -0.26 | 0.10 |
pay_rate | -0.30 | 0.06 | 1.00 | 0.25 | -0.16 | -0.06 | -0.03 |
prod_rate | -0.26 | 0.17 | 0.25 | 1.00 | -0.70 | -0.52 | 0.01 |
err_rate | 0.15 | -0.28 | -0.16 | -0.70 | 1.00 | 0.57 | 0.04 |
cmpln_rate | 0.20 | -0.26 | -0.06 | -0.52 | 0.57 | 1.00 | 0.10 |
__RND__ | 0.01 | 0.10 | -0.03 | 0.01 | 0.04 | 0.10 | 1.00 |
Наблюдения:
perf_score
)pay_rate
)prod_rate
)Terminated for Cause
) характеризуются:err_rate
)cmpln_rate
)Сотрудники уволенные за нарушение
prod_corr_non_termd = get_corr_matrix(corr_columns, subquery=SQL_PROD_STAFF_SUBQUERY,
where="is_termd = 1 AND is_termd_cause = 1 AND position IN ('Production Technician I', 'Production Technician II')",
corr_rnd=True, seed=0)
prod_corr_non_termd.style.background_gradient(cmap='coolwarm', axis=None).format('{:.2f}')
perf_score | pay_rate | prod_rate | err_rate | cmpln_rate | __RND__ | |
---|---|---|---|---|---|---|
perf_score | 1.00 | 0.38 | -0.06 | -0.51 | -0.78 | -0.29 |
pay_rate | 0.38 | 1.00 | -0.28 | 0.11 | 0.08 | -0.24 |
prod_rate | -0.06 | -0.28 | 1.00 | -0.68 | -0.44 | -0.65 |
err_rate | -0.51 | 0.11 | -0.68 | 1.00 | 0.77 | 0.43 |
cmpln_rate | -0.78 | 0.08 | -0.44 | 0.77 | 1.00 | 0.34 |
__RND__ | -0.29 | -0.24 | -0.65 | 0.43 | 0.34 | 1.00 |
Наблюдения:
__RND__
являются индикатором дефицита данных (7 строк в выборке).Средние показатели сотрудников под руководством менеджера
managers_stats = read_sql(f"""
WITH stats_by_manager AS (
SELECT
manager_name
, count(*) AS group_count
, avg(pay_rate) AS avg_pay_rate
, avg(prod_rate) AS avg_prod_rate
, avg(err_rate) AS avg_err_rate
, avg(cmpln_rate) AS avg_cmpln_rate
, avg(perf_score) AS avg_perf_score
FROM ({SQL_PROD_STAFF_SUBQUERY}) AS prod_staff
WHERE position IN ('Production Technician I', 'Production Technician II') AND is_termd = 0
GROUP BY manager_name
)
, stats_normalized AS (
SELECT
manager_name || ' (' || group_count || ')' AS manager_name
, (avg_prod_rate - min(avg_prod_rate ) OVER ()) * 0.9 / (max(avg_prod_rate ) OVER () - min(avg_prod_rate ) OVER ()) + 0.1 AS "Prod Rate"
, (avg_perf_score - min(avg_perf_score) OVER ()) * 0.9 / (max(avg_perf_score) OVER () - min(avg_perf_score) OVER ()) + 0.1 AS "Performance"
, (avg_pay_rate - min(avg_pay_rate ) OVER ()) * 0.9 / (max(avg_pay_rate ) OVER () - min(avg_pay_rate ) OVER ()) + 0.1 AS "Pay Rate"
, -(avg_err_rate - min(avg_err_rate ) OVER ()) * 0.9 / (max(avg_err_rate ) OVER () - min(avg_err_rate ) OVER ()) - 0.1 AS "Error Rate"
, -(avg_cmpln_rate - min(avg_cmpln_rate) OVER ()) * 0.9 / (max(avg_cmpln_rate) OVER () - min(avg_cmpln_rate) OVER ()) - 0.1 AS "Compaints Rate"
, -(group_count::real - min(group_count ) OVER ()) * 0.9 / (max(group_count ) OVER () - min(group_count ) OVER ()) - 0.1 AS "Count"
FROM stats_by_manager
)
SELECT *
FROM stats_normalized
ORDER BY "Prod Rate" DESC, "Error Rate" DESC, "Compaints Rate" DESC
""")
managers_stats.set_index('manager_name').plot.bar(figsize=(5,5), stacked=True)
plt.xticks(rotation=45, ha='right')
plt.xlabel('')
plt.title('Средние показатели сотрудников под руководством менеджера')
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.box(False)
pass
Комментарий к диаграмме:
Наблюдения:
Структура персонала
current_grouping = read_sql(f"""
WITH tech_staff AS (
SELECT raw."Employee Name" AS emp_name, prod_staff.*
FROM ({SQL_PROD_STAFF_SUBQUERY}) AS prod_staff
JOIN production_staff AS raw USING (id)
WHERE "Position" IN ('Production Technician I', 'Production Technician II')
AND prod_rate IS NOT NULL
AND is_termd = 0
)
, managers AS (
SELECT manager_name
FROM tech_staff
GROUP BY manager_name
ORDER BY avg(prod_rate) DESC
)
SELECT *
FROM managers
CROSS JOIN lateral (
SELECT emp_name, prod_rate, err_rate, cmpln_rate
FROM tech_staff AS subgroup
WHERE subgroup.manager_name = managers.manager_name
ORDER BY prod_rate
) AS low__prod_rate__group
""")
plt.figure(figsize=(15, 5))
ax = sns.swarmplot('manager_name', 'prod_rate', data=current_grouping)
ax.set(xlabel='Manager', ylabel='Production Rate, abutments/hour', title='Employee Groups')
plt.box(False)
pass
plt.figure(figsize=(15,5))
ax = sns.boxplot('manager_name', 'prod_rate', data=current_grouping)
ax.set(xlabel='Manager', ylabel='Production Rate, abutments/hour', title='Employee Groups')
plt.box(False)
Наблюдения:
Подобные нетипичные случаи, стоящие "особняком" называются outliers (выбросы). Следующий запрос вычисляет допустимые границы и ищет выбросы по каждой группе.
emp_outliers = read_sql(f"""
WITH tech_staff AS (
SELECT raw."Employee Name" AS emp_name, prod_staff.*
FROM ({SQL_PROD_STAFF_SUBQUERY}) AS prod_staff
JOIN production_staff AS raw USING (id)
WHERE "Position" IN ('Production Technician I', 'Production Technician II')
AND prod_rate IS NOT NULL
AND is_termd = 0
)
, managers AS (
SELECT manager_name,
2.5 * percentile_cont(0.25) WITHIN GROUP (ORDER BY prod_rate)
- 1.5 * percentile_cont(0.75) WITHIN GROUP (ORDER BY prod_rate) AS lower_bound,
2.5 * percentile_cont(0.75) WITHIN GROUP (ORDER BY prod_rate)
- 1.5 * percentile_cont(0.25) WITHIN GROUP (ORDER BY prod_rate) AS upper_bound
FROM tech_staff
GROUP BY manager_name
)
SELECT manager_name, emp_name, prod_rate, 'Below' AS level
FROM managers
JOIN tech_staff USING (manager_name)
WHERE prod_rate < lower_bound
UNION ALL
SELECT manager_name, emp_name, prod_rate, 'Above' AS level
FROM managers
JOIN tech_staff USING (manager_name)
WHERE prod_rate > upper_bound
ORDER BY manager_name
""")
emp_outliers
manager_name | emp_name | prod_rate | level | |
---|---|---|---|---|
0 | Elijiah Gray | Faller, Megan | 16 | Above |
1 | Elijiah Gray | Smith, Joe | 15 | Above |
2 | Kissy Sullivan | Garneau, Hamish | 8 | Below |
3 | Kissy Sullivan | Bugali, Josephine | 14 | Above |
4 | Kissy Sullivan | Davis, Daniel | 14 | Above |
recruiting_costs
¶Затраты на подбор персонала
Предпросмотр исходных данных
recr_costs_preview = read_sql("SELECT * FROM recruiting_costs LIMIT 50;", index_col='id')
recr_costs_preview
Employment Source | January | February | March | April | May | June | July | August | September | October | November | December | Total | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
id | ||||||||||||||
1 | Billboard | 520 | 520 | 520 | 520 | 0 | 0 | 612 | 612 | 729 | 749 | 910 | 500 | 6192 |
2 | Careerbuilder | 410 | 410 | 410 | 820 | 820 | 410 | 410 | 820 | 820 | 1230 | 820 | 410 | 7790 |
3 | Company Intranet - Partner | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | Diversity Job Fair | 0 | 5129 | 0 | 0 | 0 | 0 | 0 | 4892 | 0 | 0 | 0 | 0 | 10021 |
5 | Employee Referral | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
6 | Glassdoor | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
7 | Information Session | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
8 | Internet Search | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
9 | MBTA ads | 640 | 640 | 640 | 640 | 640 | 640 | 640 | 1300 | 1300 | 1300 | 1300 | 1300 | 10980 |
10 | Monster.com | 500 | 500 | 500 | 440 | 500 | 500 | 440 | 500 | 440 | 440 | 500 | 500 | 5760 |
11 | Newspager/Magazine | 629 | 510 | 293 | 810 | 642 | 675 | 707 | 740 | 772 | 805 | 838 | 870 | 8291 |
12 | On-campus Recruiting | 0 | 0 | 2500 | 0 | 0 | 2500 | 0 | 0 | 2500 | 0 | 0 | 0 | 7500 |
13 | On-line Web application | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
14 | Other | 0 | 492 | 0 | 829 | 744 | 0 | 610 | 0 | 0 | 510 | 0 | 810 | 3995 |
15 | Pay Per Click | 110 | 110 | 60 | 121 | 110 | 109 | 130 | 146 | 105 | 109 | 105 | 110 | 1323 |
16 | Pay Per Click - Google | 330 | 330 | 180 | 362 | 197 | 152 | 389 | 437 | 315 | 327 | 315 | 176 | 3509 |
17 | Professional Society | 100 | 100 | 100 | 100 | 100 | 100 | 100 | 100 | 100 | 100 | 100 | 100 | 1200 |
18 | Search Engine - Google Bing Yahoo | 330 | 410 | 388 | 372 | 472 | 412 | 416 | 495 | 619 | 502 | 389 | 378 | 5183 |
19 | Social Networks - Facebook Twitter etc | 420 | 481 | 452 | 479 | 392 | 508 | 578 | 466 | 389 | 439 | 491 | 478 | 5573 |
20 | Vendor Referral | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
21 | Website Banner Ads | 400 | 400 | 300 | 388 | 592 | 610 | 620 | 669 | 718 | 767 | 816 | 865 | 7143 |
22 | Word of Mouth | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
Описание данных
get_column_stats('recruiting_costs')
table_name | column_name | data_type | unique | common_val | common_freq | non_null | count | size_bytes | entropy | |
---|---|---|---|---|---|---|---|---|---|---|
0 | recruiting_costs | id | integer | 22 | 22 | 0.045455 | 22 | 22 | 88 | 4.459432 |
1 | recruiting_costs | Employment Source | character varying | 22 | Billboard | 0.045455 | 22 | 22 | 405 | 4.459432 |
2 | recruiting_costs | January | integer | 11 | 0 | 0.500000 | 22 | 22 | 88 | 2.638807 |
3 | recruiting_costs | February | integer | 13 | 0 | 0.409091 | 22 | 22 | 88 | 3.071735 |
4 | recruiting_costs | March | integer | 13 | 0 | 0.454545 | 22 | 22 | 88 | 2.949464 |
5 | recruiting_costs | April | integer | 13 | 0 | 0.454545 | 22 | 22 | 88 | 2.949464 |
6 | recruiting_costs | May | integer | 12 | 0 | 0.500000 | 22 | 22 | 88 | 2.729716 |
7 | recruiting_costs | June | integer | 12 | 0 | 0.500000 | 22 | 22 | 88 | 2.729716 |
8 | recruiting_costs | July | integer | 13 | 0 | 0.454545 | 22 | 22 | 88 | 2.949464 |
9 | recruiting_costs | August | integer | 13 | 0 | 0.454545 | 22 | 22 | 88 | 2.949464 |
10 | recruiting_costs | September | integer | 13 | 0 | 0.454545 | 22 | 22 | 88 | 2.949464 |
11 | recruiting_costs | October | integer | 13 | 0 | 0.454545 | 22 | 22 | 88 | 2.949464 |
12 | recruiting_costs | November | integer | 12 | 0 | 0.500000 | 22 | 22 | 88 | 2.729716 |
13 | recruiting_costs | December | integer | 12 | 0 | 0.454545 | 22 | 22 | 88 | 2.858555 |
14 | recruiting_costs | Total | integer | 15 | 0 | 0.363636 | 22 | 22 | 88 | 3.368523 |
Описание полей таблицы recruiting_costs
Field | Type | Description |
---|---|---|
id | Integer | ID строки |
Employee Source | Text | Канал поиска соискателей: |
- Billboard – Рекламный щит | ||
- Employee Referral – Рекомендация сотрудника | ||
- Vendor Referral – Рекомендация продавца | ||
- Word of Mouth – Из уст в уста / Сарафанное радио | ||
- Pay Per Click – Оплата за клик | ||
- Pay Per Click - Google – Оплата за клик - Google | ||
- Social Networks - Facebook Twitter etc – Социальные сети - Facebook, Twitter и т.д. | ||
- Search Engine - Google Bing Yahoo – Поисковая система - Google Bing Yahoo | ||
- Professional Society – Профессиональное сообщество | ||
- Diversity Job Fair – Ярмарка вакансий | ||
- Company Intranet - Partner – Внутренняя сеть компании и партнеров | ||
- On-line Web application – Онлайн веб-приложение | ||
- Newspager/Magazine – Газеты / Журналы | ||
- Website Banner Ads – Баннерная реклама на сайте | ||
- Information Session – Презентация организации | ||
- On-campus Recruiting – Набор в университете / колледже | ||
- Internet Search – Поиск в интернете | ||
- Careerbuilder – www.careerbuilder.com - Сайт для соискателей | ||
- Glassdoor – www.glassdoor.com - Сайт для соискателей | ||
- Monster.com – www.monster.com - Сайт для соискателей | ||
- MBTA ads – www.mbta.com – Реклама на транспорте и остановках (Massachusetts Bay Transportation Authority) | ||
- Other – Другой | ||
January .. December | Integer | Затраты на рекрутинг, за указанный месяц |
Total | Integer | Затраты на рекрутинг, всего |
При запросе к таблице recruiting_costs
можно столкнуться со следующей ошибкой:
SELECT "January"
FROM recruiting_costs
/* output */
...
column "January" does not exist
LINE 2: SELECT "January"
^
HINT: Perhaps you meant to reference the column "recruiting_costs.January ".
Ошибка связана с тем, что идентификатор колонки "January "
содержит пробел на конце. Для дальнейшей работы с данными следует привести идентификаторы колонок к единому формату.
SQL_RECR_COSTS_SUBQUERY = """
SELECT *
FROM (SELECT * FROM recruiting_costs)
AS recr_costs(id, emp_source, jan, feb, mar, apr, may, jun, jul, aug, sep, oct, nov, dec, total)
"""
recr_costs = read_sql(f"""
WITH recr_costs(emp_source, total, jan, feb, mar, apr, may, jun, jul, aug, sep, oct, nov, dec) AS (
SELECT emp_source, total::real / 1000,
jan::real / 1000, feb::real / 1000, mar::real / 1000, apr::real / 1000, may::real / 1000, jun::real / 1000,
jul::real / 1000, aug::real / 1000, sep::real / 1000, oct::real / 1000, nov::real / 1000, dec::real / 1000
FROM ({SQL_RECR_COSTS_SUBQUERY}) AS subquery
)
(
SELECT emp_source, total::real / (SELECT sum(total) FROM recr_costs) * 100 AS "Total, %",
jan, feb, mar, apr, may, jun, jul, aug, sep, oct, nov, dec
FROM recr_costs
ORDER BY "Total, %"
) UNION ALL (
SELECT 'Total (k)', sum(total),
sum(jan), sum(feb), sum(mar), sum(apr), sum(may), sum(jun),
sum(jul), sum(aug), sum(sep), sum(oct), sum(nov), sum(dec)
FROM recr_costs
)
""", index_col='emp_source')
plot_pivot_table(recr_costs, cmap='Purples', title='Расходы на каналы поиска сотрудников, (тыс.)', fmt='.1f')
plt.ylabel('Employee Source')
pass
Наблюдения:
Employee Referral, Vendor Referral, Word of Mouth
) не требуют расходов.Careerbuilder
стоит на 4-м месте по уровню расходов (9.2%), являясь при этом одним из наименее эффективных каналов поиска.MBTA ads
возросли более чем в 2 раза начиная с августа.salary_grid
¶Зарплатная сетка
Предпросмотр исходных данных
salary_preview = read_sql("SELECT * FROM salary_grid LIMIT 50;", index_col='id')
salary_preview
Position | Salary Min | Salary Mid | Salary Max | Hourly Min | Hourly Mid | Hourly Max | |
---|---|---|---|---|---|---|---|
id | |||||||
1 | Administrative Assistant | 30000 | 40000 | 50000 | 14.42 | 19.23 | 24.04 |
2 | Sr. Administrative Assistant | 35000 | 45000 | 55000 | 16.83 | 21.63 | 26.44 |
3 | Accountant I | 42274 | 51425 | 62299 | 20.32 | 24.72 | 29.95 |
4 | Accountant II | 50490 | 62158 | 74658 | 24.27 | 29.88 | 35.89 |
5 | Sr. Accountant | 63264 | 76988 | 92454 | 30.42 | 37.01 | 44.45 |
6 | Network Engineer | 50845 | 66850 | 88279 | 24.44 | 32.14 | 42.44 |
7 | Sr. Network Engineer | 79428 | 99458 | 120451 | 38.19 | 47.82 | 57.91 |
8 | Database Administrator | 50569 | 68306 | 93312 | 24.31 | 32.84 | 44.86 |
9 | Sr. DBA | 92863 | 116007 | 139170 | 44.65 | 55.77 | 66.91 |
10 | Production Technician I | 30000 | 40000 | 50000 | 14.42 | 19.23 | 24.04 |
11 | Production Technician II | 38000 | 48000 | 58000 | 18.27 | 23.08 | 27.88 |
12 | Lead Production Technician | 45000 | 55000 | 65000 | 21.63 | 26.44 | 31.25 |
Описание данных
get_column_stats('salary_grid')
table_name | column_name | data_type | unique | common_val | common_freq | non_null | count | size_bytes | entropy | |
---|---|---|---|---|---|---|---|---|---|---|
0 | salary_grid | id | integer | 12 | 5 | 0.083333 | 12 | 12 | 48 | 3.584963 |
1 | salary_grid | Position | character varying | 12 | Sr. DBA | 0.083333 | 12 | 12 | 241 | 3.584963 |
2 | salary_grid | Salary Min | integer | 11 | 30000 | 0.166667 | 12 | 12 | 48 | 3.418296 |
3 | salary_grid | Salary Mid | integer | 11 | 40000 | 0.166667 | 12 | 12 | 48 | 3.418296 |
4 | salary_grid | Salary Max | integer | 11 | 50000 | 0.166667 | 12 | 12 | 48 | 3.418296 |
5 | salary_grid | Hourly Min | real | 11 | 14.42 | 0.166667 | 12 | 12 | 48 | 3.418296 |
6 | salary_grid | Hourly Mid | real | 11 | 19.23 | 0.166667 | 12 | 12 | 48 | 3.418296 |
7 | salary_grid | Hourly Max | real | 11 | 24.04 | 0.166667 | 12 | 12 | 48 | 3.418296 |
Описание полей таблицы salary_grid
Field | Type | Description |
---|---|---|
id | Integer | ID строки |
Position | Text | Позиция в компании. Пример: Production Technician I |
Salary Min | Integer | Годовой оклад, min |
Salary Mid | Integer | Годовой оклад, median |
Salary Max | Integer | Годовой оклад, max |
Hourly Min | Real | Размер почасовой оплаты, min |
Hourly Mid | Real | Размер почасовой оплаты, median |
Hourly Max | Real | Размер почасовой оплаты, max |
Годовой оклад исходя из расчета 40 ч/нед. (2080 ч/год).
SQL_SALARY_SUBQUERY = """
SELECT
"Position" AS position
, "Salary Min" AS annual_min
, "Salary Mid" AS annual_mid
, "Salary Max" AS annual_max
, "Hourly Min" AS hourly_min
, "Hourly Mid" AS hourly_mid
, "Hourly Max" AS hourly_max
FROM salary_grid
"""
sal = read_sql(SQL_SALARY_SUBQUERY)
sal
position | annual_min | annual_mid | annual_max | hourly_min | hourly_mid | hourly_max | |
---|---|---|---|---|---|---|---|
0 | Administrative Assistant | 30000 | 40000 | 50000 | 14.42 | 19.23 | 24.04 |
1 | Sr. Administrative Assistant | 35000 | 45000 | 55000 | 16.83 | 21.63 | 26.44 |
2 | Accountant I | 42274 | 51425 | 62299 | 20.32 | 24.72 | 29.95 |
3 | Accountant II | 50490 | 62158 | 74658 | 24.27 | 29.88 | 35.89 |
4 | Sr. Accountant | 63264 | 76988 | 92454 | 30.42 | 37.01 | 44.45 |
5 | Network Engineer | 50845 | 66850 | 88279 | 24.44 | 32.14 | 42.44 |
6 | Sr. Network Engineer | 79428 | 99458 | 120451 | 38.19 | 47.82 | 57.91 |
7 | Database Administrator | 50569 | 68306 | 93312 | 24.31 | 32.84 | 44.86 |
8 | Sr. DBA | 92863 | 116007 | 139170 | 44.65 | 55.77 | 66.91 |
9 | Production Technician I | 30000 | 40000 | 50000 | 14.42 | 19.23 | 24.04 |
10 | Production Technician II | 38000 | 48000 | 58000 | 18.27 | 23.08 | 27.88 |
11 | Lead Production Technician | 45000 | 55000 | 65000 | 21.63 | 26.44 | 31.25 |
pay_rate_level = read_sql(f"""
SELECT
position AS "Position"
, sum((pay_rate < hourly_min)::int)::real / count(*) * 100 AS "Lower"
, sum((pay_rate BETWEEN hourly_min AND hourly_max)::int)::real / count(*) * 100 AS "Good"
, sum((pay_rate > hourly_max)::int)::real / count(*) * 100 AS "Higher"
FROM ({SQL_HR_DATASET_SUBQUERY}) AS hr_dataset
INNER JOIN ({SQL_SALARY_SUBQUERY}) AS salary USING (position)
GROUP BY "Position"
ORDER BY "Lower" DESC, "Higher"
""", index_col='Position')
ax = pay_rate_level.plot.barh(stacked=True, figsize=(10, 3))
plt.box(False)
plt.title('Доля зарплат ниже/выше границы в рамках одной должности, %')
ax.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
ax.legend(bbox_to_anchor=(1,.5), loc='center left')
pass
emp_pay_low = read_sql(f"""
SELECT "Employee Name", position AS "Position", "Pay Rate", hourly_min AS "Pay Rate Min",
hourly_min::decimal - "Pay Rate" AS "Underpay"
FROM hr_dataset
INNER JOIN ({SQL_SALARY_SUBQUERY}) AS salary USING (position)
WHERE hourly_min > "Pay Rate"
ORDER BY hourly_min - "Pay Rate" DESC
LIMIT 50
""")
emp_pay_low
# Список сотрудников с зарплатой, выходящей за нижнюю границу зарплатной сетки
Employee Name | Position | Pay Rate | Pay Rate Min | Underpay | |
---|---|---|---|---|---|
0 | Gross, Paula | Production Technician I | 14.0 | 14.42 | 0.42 |
1 | Knapp, Bradley J | Production Technician I | 14.0 | 14.42 | 0.42 |
2 | Meads, Elizabeth | Production Technician I | 14.0 | 14.42 | 0.42 |
3 | Stanford,Barbara M | Production Technician I | 14.0 | 14.42 | 0.42 |
4 | Sutwell, Barbara | Production Technician I | 14.0 | 14.42 | 0.42 |
emp_pay_high = read_sql(f"""
SELECT "Employee Name", position AS "Position", "Pay Rate", hourly_max AS "Pay Rate Max",
"Pay Rate"::decimal - hourly_max AS "Overpay"
FROM hr_dataset
INNER JOIN ({SQL_SALARY_SUBQUERY}) AS salary USING (position)
WHERE hourly_max < "Pay Rate"
ORDER BY "Pay Rate" - hourly_max DESC
LIMIT 50
""")
emp_pay_high
# Список сотрудников с зарплатой, выходящей за верхнюю границу зарплатной сетки
Employee Name | Position | Pay Rate | Pay Rate Max | Overpay | |
---|---|---|---|---|---|
0 | Turpin, Jumil | Network Engineer | 49.10 | 42.44 | 6.660001 |
1 | Shepard, Anita | Network Engineer | 47.00 | 42.44 | 4.560001 |
2 | Goble, Taisha | Database Administrator | 48.50 | 44.86 | 3.639999 |
3 | Bacong, Alejandro | Network Engineer | 45.00 | 42.44 | 2.560001 |
4 | Winthrop, Jordan | Production Technician II | 29.00 | 27.88 | 1.120001 |
5 | Akinkuolie, Sarah | Production Technician II | 29.00 | 27.88 | 1.120001 |
6 | Demita, Carla | Production Technician II | 29.00 | 27.88 | 1.120001 |
7 | Fitzpatrick, Michael J | Production Technician II | 29.00 | 27.88 | 1.120001 |
8 | Gonzalez, Juan | Production Technician II | 29.00 | 27.88 | 1.120001 |
9 | Moumanil, Maliki | Production Technician II | 29.00 | 27.88 | 1.120001 |
10 | Peters, Lauren | Production Technician II | 29.00 | 27.88 | 1.120001 |
11 | Sahoo, Adil | Production Technician II | 29.00 | 27.88 | 1.120001 |
12 | Tejeda, Lenora | Production Technician II | 28.75 | 27.88 | 0.870001 |
13 | Clukey, Elijian | Production Technician I | 24.75 | 24.04 | 0.709999 |
14 | Merlos, Carlos | Network Engineer | 43.00 | 42.44 | 0.560001 |
15 | Smith, Sade | Production Technician I | 24.50 | 24.04 | 0.459999 |
16 | Salter, Jason | Database Administrator | 45.00 | 44.86 | 0.139999 |
17 | Homberger, Adrienne J | Production Technician II | 28.00 | 27.88 | 0.120001 |
18 | Latif, Mohammed | Production Technician II | 28.00 | 27.88 | 0.120001 |
19 | Pelletier, Ermine | Production Technician II | 28.00 | 27.88 | 0.120001 |
20 | Hutter, Rosalie | Production Technician II | 28.00 | 27.88 | 0.120001 |
Вот несколько утверждений из интервью 2008 года \"Thomas Sowell - Gender Bias and Income Disparity: A Myth?\" (youtube.com), взятого у американского экономиста Томаса Соуэлла:
... the asymmetrical division of domestic responsibilities produces male-female differences in incomes ...
— Economic Facts and Fallacies: Second Edition By Thomas Sowell
Замечание. Поскольку в датасете нет информации о количестве отработанных каждым сотрудником часов, нельзя непосредственно проверить гипотезу о доходах (income), поэтому будут проверены сопутствующие гипотезы.
Гипотеза 1. Производительность (performance score) женщин, не состоявших в браке, выше.
Вторая гипотеза была выдвинута, в том числе, на основании предыдущих результатов анализа корреляционных матриц, которые обнаружили связь между показателями pay_rate
$\uparrow\uparrow$ is_male
(0.13) среди действующих сотрудников.
Гипотеза 2. Зависимость между полом сотрудника и размером ставки является следствием гендерной дискриминации в данной компании.
Анализ производительности в зависимости от пола и семейного положения
def get_single_perf(is_male=0):
return read_sql(f"""
WITH hr_subset AS ({SQL_HR_DATASET_SUBQUERY})
, single_male_perf_count AS (
SELECT
count(*) AS cnt
, CASE WHEN marital_desc = 'Single' THEN 'Never Married' ELSE 'Was Married' END AS was_married
, perf_level
FROM hr_subset
WHERE is_male = %(is_male)s
GROUP BY was_married, perf_level)
, perf_pct(pct, was_married, perf_level) AS (
SELECT cnt / sum(cnt) OVER (PARTITION BY was_married), was_married, perf_level FROM single_male_perf_count)
SELECT was_married, low, average, high
FROM (SELECT * FROM perf_pct WHERE perf_level = 'low') AS t1(low, was_married)
JOIN (SELECT * FROM perf_pct WHERE perf_level = 'average') AS t2(average, was_married) USING (was_married)
JOIN (SELECT * FROM perf_pct WHERE perf_level = 'high') AS t3(high, was_married) USING (was_married)
""",
params={'is_male': is_male}, index_col='was_married')
df_single_female_perf = get_single_perf(is_male=0)
df_single_male_perf = get_single_perf(is_male=1)
curr_palet = 'Paired'
fig, axes = plt.subplots(2, figsize=(14, 4))
with sns.color_palette(curr_palet, 3):
df_single_male_perf.plot.barh(stacked=True, ax=axes[0], title='Male')
df_single_female_perf.plot.barh(stacked=True, ax=axes[1], title='Female')
for ax in axes:
ax.set_ylabel('')
ax.set_frame_on(False)
ax.spines['bottom'].set_visible(False)
ax.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
ax.legend(title='Performance', bbox_to_anchor=(1,.5), loc='center left')
Наблюдение:
Гипотеза 1 – подтверждена. Женщины не состоявшие в браке имеют более высокую оценку производительности труда.
Анализ показателя Pay Rate
(ставка)
def get_mid_pay_rate(positions=None):
""" >>> get_mid_pay_rate(('Production Technician I',))
...
>>> get_mid_pay_rate(('Production Technician I', 'Production Technician II'))
...
"""
return read_sql(f"""
WITH mid_pay_rate AS (
SELECT
'Median Pay Rate' AS row_name
, gender_desc AS column_name
, percentile_cont(0.5) WITHIN GROUP (ORDER BY pay_rate) AS value
FROM ({SQL_HR_DATASET_SUBQUERY}) AS subq_1
{'--' if positions is None else ''} WHERE position IN %s
GROUP BY gender_desc
)
SELECT
female.value AS "Female"
, male.value AS "Male"
FROM (SELECT * FROM mid_pay_rate WHERE column_name = 'Female') female
JOIN (SELECT * FROM mid_pay_rate WHERE column_name = 'Male') male USING (row_name)
""", params=[positions])
df_mid_pay_rate = get_mid_pay_rate()
with sns.color_palette(curr_palet):
ax = df_mid_pay_rate.plot.bar(figsize=(2, 4))
plt.box(False)
ax.set_xlabel('Pay Rate Median')
ax.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
ax.legend(bbox_to_anchor=(1,.5), loc='center left')
pass
Наблюдение:
Ставка и численность сотрудников на различных должностях
df_position_gender__pay_rate = read_sql(f"""
SELECT
position
, gender_desc
, percentile_cont(0.5) WITHIN GROUP (ORDER BY pay_rate) AS median
FROM ({SQL_HR_DATASET_SUBQUERY}) AS subq_1
GROUP BY position, gender_desc
ORDER BY count(*) DESC, position
""")
df_position_gender_dist = read_sql(f"""
SELECT
position
, gender_desc
, count(*)
FROM ({SQL_HR_DATASET_SUBQUERY}) AS subq_1
GROUP BY position, gender_desc
ORDER BY count(*) DESC, position
""")
fig, [ax_0, ax_1] = plt.subplots(ncols=2, sharey=True, figsize=(13,13))
sns.barplot('median', 'position', 'gender_desc', data=df_position_gender__pay_rate, ax=ax_0, palette=curr_palet)
ax_0.set_frame_on(False)
ax_0.set_title('Pay Rate Median', fontsize='18')
ax_0.set_xlabel('pay rate')
ax_0.legend(loc='center right')
sns.barplot('count', 'position', 'gender_desc', data=df_position_gender_dist, ax=ax_1, palette=curr_palet)
ax_1.set_frame_on(False)
ax_1.set_title('Count', fontsize='18')
ax_1.set_ylabel('')
ax_1.legend(loc='center right').remove()
pass
Наблюдения:
Чтобы сделать вывод по 2-й гипотезе (о размере ставки) необходимо рассмотреть размер ставки в рамках конкретной должности.
Ставка сотрудников на технических и управляющих должностях
# Медианная ставка сотрудников на технических должностях
df_mid_pay_rate_prod = get_mid_pay_rate(('Production Technician I', 'Production Technician II'))
# Медианная ставка сотрудников на управляющих должностях
df_mid_pay_rate_mgr = get_mid_pay_rate(('Area Sales Manager', 'Production Manager'))
with sns.color_palette(curr_palet):
fig, [ax_0, ax_1] = plt.subplots(nrows=2, figsize=(10, 4), sharex=True)
df_mid_pay_rate_prod.plot.barh(title='Production Technician I/II Pay Rate Median', ax=ax_0)
ax_0.set_frame_on(False)
ax_0.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
ax_0.tick_params(axis='y', which='both', left=False, right=False, labelleft=False)
df_mid_pay_rate_mgr.plot.barh(title='Manager Pay Rate Median', ax=ax_1)
ax_1.set_frame_on(False)
ax_1.legend().remove()
plt.xlabel('Pay Rate Median')
ax_1.tick_params(axis='y', which='both', left=False, right=False, labelleft=False)
Наблюдения:
Гипотеза 2 – опровергнута. Зависимость между полом сотрудника и размером ставки не является следствием гендерной дискриминации и объясняется лишь неравнозначностью занимаемых должностей.
Согласно результатам анализа, рекомендуемые (с учетом рейтинга) каналы поиска сотрудников:
На позицию Production Technician I:
На позицию Production Technician II:
На позицию Area Sales Manager / Production Manager:
На позицию Database Administrator / Software Engineer / (Sr.) Network Engineer:
На позицию Data Architect / (Senior) BI Developer:
На позицию Manager (lower) / Accountant / Support / Assistant:
На позицию Manager (middle):
На позицию Manager (top) / Director:
Согласно результатам анализа имеют больший размер ставки:
Согласно результатам анализа имеют больший размер ставки:
Раса | Позиции |
---|---|
Азиаты | Database Administrator / Software Engineer / Sr. Network Engineer |
Европейцы | Accountant I / IT Support / Sales Manager / Sr. DBA |
Латиноамериканцы | Production Manager |
Индейцы или уроженцы Аляски | Area Sales Manager |
Темнокожие или афроамериканцы | Production Technician I/II |
Согласно результатам анализа сотрудники, никогда не состоявшие в браке, имеют более высокую оценку производительности труда.
Согласно результатам анализа увольнения за нарушения чаще происходили среди более молодых сотрудников.
Равномерное распределение
def new_sql_staff_groups(evenly=True):
return f"""
WITH tech_staff AS (
SELECT raw."Employee Name" AS emp_name, prod_staff.*
FROM ({SQL_PROD_STAFF_SUBQUERY}) AS prod_staff
JOIN production_staff AS raw USING (id)
WHERE "Position" IN ('Production Technician I', 'Production Technician II')
AND prod_rate IS NOT NULL
AND is_termd = 0
)
, managers AS (
SELECT manager_name, row_number() OVER (ORDER BY avg(prod_rate) DESC) - 1 AS manager_pos
FROM tech_staff
GROUP BY manager_name
ORDER BY avg(prod_rate) DESC
)
, subordinates AS (
SELECT emp_name, prod_rate, row_number() OVER (ORDER BY prod_rate DESC) - 1 AS emp_pos
FROM tech_staff
ORDER BY prod_rate DESC
)
SELECT *
FROM managers
CROSS JOIN lateral (
SELECT emp_name, prod_rate
FROM subordinates AS subgroup
WHERE subgroup.emp_pos {'%' if evenly else '/'} 9 = managers.manager_pos
ORDER BY prod_rate
) AS employees
"""
grouping_1 = read_sql(new_sql_staff_groups(evenly=True))
plt.figure(figsize=(15, 5))
ax = sns.swarmplot('manager_name', 'prod_rate', data=grouping_1)
ax.set(xlabel='Manager', ylabel='Production Rate, abutments/hour', title='Employee Groups')
plt.box(False)
pass
plt.figure(figsize=(15,5))
ax = sns.boxplot('manager_name', 'prod_rate', data=grouping_1)
ax.set(xlabel='Manager', ylabel='Production Rate, abutments/hour', title='Employee Groups')
plt.box(False)
Деление на классы
grouping_2 = read_sql(new_sql_staff_groups(evenly=False))
plt.figure(figsize=(15, 5))
ax = sns.swarmplot('manager_name', 'prod_rate', data=grouping_2)
ax.set(xlabel='Manager', ylabel='Production Rate, abutments/hour', title='Employee Groups')
plt.box(False)
pass
Комбинированный подход. Два лидера
grouping_3 = read_sql(f"""
WITH tech_staff AS (
SELECT
row_number() OVER (ORDER BY prod_rate DESC) - 1 AS emp_pos
, raw."Employee Name" AS emp_name
, prod_staff.*
FROM ({SQL_PROD_STAFF_SUBQUERY}) AS prod_staff
JOIN production_staff AS raw USING (id)
WHERE "Position" IN ('Production Technician I', 'Production Technician II')
AND prod_rate IS NOT NULL
AND is_termd = 0
)
, managers AS (
SELECT manager_name, row_number() OVER (ORDER BY avg(prod_rate) DESC) - 1 AS group_id
FROM tech_staff
GROUP BY manager_name
ORDER BY avg(prod_rate) DESC
)
, group_1 AS (
SELECT emp_name, prod_rate, (row_number() OVER (ORDER BY prod_rate DESC) - 1) % 2 AS group_id
FROM tech_staff
WHERE emp_pos / 9 < 2
)
, group_2 AS (
SELECT emp_name, prod_rate, (row_number() OVER (ORDER BY prod_rate DESC) - 1) % 7 + 2 AS group_id
FROM tech_staff
WHERE emp_pos / 9 >= 2
)
SELECT *
FROM managers
CROSS JOIN lateral (
SELECT emp_name, prod_rate
FROM group_1
WHERE group_1.group_id = managers.group_id
UNION ALL
SELECT emp_name, prod_rate
FROM group_2
WHERE group_2.group_id = managers.group_id
ORDER BY prod_rate DESC
) AS employees
""")
plt.figure(figsize=(15, 5))
ax = sns.swarmplot('manager_name', 'prod_rate', data=grouping_3)
ax.set(xlabel='Manager', ylabel='Production Rate, abutments/hour', title='Employee Groups')
plt.box(False)
pass
plt.figure(figsize=(15,5))
ax = sns.boxplot('manager_name', 'prod_rate', data=grouping_3)
ax.set(xlabel='Manager', ylabel='Production Rate, abutments/hour', title='Employee Groups')
plt.box(False)
with pd.option_context('display.max_rows', 200):
display(grouping_3[['emp_name', 'manager_name']])
# Схема реорганизации структуры персонала согласно стратегии 3
emp_name | manager_name | |
---|---|---|
0 | Anderson, Linda | Kissy Sullivan |
1 | Girifalco, Evelyn | Kissy Sullivan |
2 | Walker, Roger | Kissy Sullivan |
3 | Harrison, Kara | Kissy Sullivan |
4 | Osturnka, Adeel | Kissy Sullivan |
5 | Adinolfi, Wilson K | Kissy Sullivan |
6 | Smith, Joe | Kissy Sullivan |
7 | Billis, Helen | Kissy Sullivan |
8 | Trang, Mei | Kissy Sullivan |
9 | Jeannite, Tayana | Amy Dunn |
10 | Monkfish, Erasumus | Amy Dunn |
11 | Faller, Megan | Amy Dunn |
12 | Clukey, Elijian | Amy Dunn |
13 | Gonzalez, Cayo | Amy Dunn |
14 | Sahoo, Adil | Amy Dunn |
15 | Ngodup, Shari | Amy Dunn |
16 | Hankard, Earnest | Amy Dunn |
17 | Sparks, Taylor | Amy Dunn |
18 | Fancett, Nicole | Brannon Miller |
19 | Sullivan, Timothy | Brannon Miller |
20 | Ivey, Rose | Brannon Miller |
21 | Biden, Lowan M | Brannon Miller |
22 | Darson, Jene'ya | Brannon Miller |
23 | Handschiegl, Joanne | Brannon Miller |
24 | Barone, Francesco A | Brannon Miller |
25 | Hutter, Rosalie | Brannon Miller |
26 | Mahoney, Lauren | Brannon Miller |
27 | Crimmings, Jean | Brannon Miller |
28 | Heitzman, Anthony | Brannon Miller |
29 | Stoica, Rick | Brannon Miller |
30 | Athwal, Sam | Brannon Miller |
31 | Beak, Kimberly | Brannon Miller |
32 | Bugali, Josephine | David Stanley |
33 | Chan, Lin | David Stanley |
34 | Jhaveri, Sneha | David Stanley |
35 | Gaul, Barbara | David Stanley |
36 | Sutwell, Barbara | David Stanley |
37 | Robinson, Elias | David Stanley |
38 | Ferreira, Violeta | David Stanley |
39 | Johnston, Yen | David Stanley |
40 | King, Janet | David Stanley |
41 | Chang, Donovan E | David Stanley |
42 | Lunquist, Lisa | David Stanley |
43 | Burkett, Benjamin | David Stanley |
44 | Medeiros, Jennifer | David Stanley |
45 | Fernandes, Nilson | David Stanley |
46 | Good, Susan | Kelley Spirea |
47 | Chace, Beatrice | Kelley Spirea |
48 | Kretschmer, John | Kelley Spirea |
49 | Sewkumar, Nori | Kelley Spirea |
50 | Engdahl, Jean | Kelley Spirea |
51 | Cockel, James | Kelley Spirea |
52 | Goyal, Roxana | Kelley Spirea |
53 | Moumanil, Maliki | Kelley Spirea |
54 | Jackson, Maryellen | Kelley Spirea |
55 | Alagbe,Trina | Kelley Spirea |
56 | Punjabhi, Louis | Kelley Spirea |
57 | Linden, Mathew | Kelley Spirea |
58 | Cornett, Lisa | Kelley Spirea |
59 | Petingill, Shana | Kelley Spirea |
60 | Keatts, Kramer | Elijiah Gray |
61 | Dobrin, Denisa S | Elijiah Gray |
62 | Langton, Enrico | Elijiah Gray |
63 | Gosciminski, Phylicia | Elijiah Gray |
64 | Knapp, Bradley J | Elijiah Gray |
65 | Carey, Michael | Elijiah Gray |
66 | Harrell, Ludwick | Elijiah Gray |
67 | Rose, Ashley | Elijiah Gray |
68 | Zima, Colleen | Elijiah Gray |
69 | Peterson, Kayla | Elijiah Gray |
70 | Blount, Dianna | Elijiah Gray |
71 | Gentry, Mildred | Elijiah Gray |
72 | Pitt, Brad | Elijiah Gray |
73 | McCarthy, Brigit | Elijiah Gray |
74 | Davis, Daniel | Michael Albert |
75 | Erilus, Angela | Michael Albert |
76 | Cierpiszewski, Caroline | Michael Albert |
77 | Gold, Shenice | Michael Albert |
78 | Bernstein, Sean | Michael Albert |
79 | Monterro, Luisa | Michael Albert |
80 | Newman, Richard | Michael Albert |
81 | Motlagh, Dawn | Michael Albert |
82 | Buccheri, Joseph | Michael Albert |
83 | Nguyen, Lei-Ming | Michael Albert |
84 | Garneau, Hamish | Michael Albert |
85 | Garcia, Raul | Michael Albert |
86 | Manchester, Robyn | Michael Albert |
87 | Smith, Sade | Michael Albert |
88 | Wolk, Hang T | Ketsia Liebig |
89 | Rivera, Haley | Ketsia Liebig |
90 | Fidelia, Libby | Ketsia Liebig |
91 | Jacobi, Hannah | Ketsia Liebig |
92 | Stanford,Barbara M | Ketsia Liebig |
93 | Sander, Kamrin | Ketsia Liebig |
94 | Burke, Joelle | Ketsia Liebig |
95 | Shields, Seffi | Ketsia Liebig |
96 | Beatrice, Courtney | Ketsia Liebig |
97 | Lydon, Allison | Ketsia Liebig |
98 | Maurice, Shana | Ketsia Liebig |
99 | England, Rex | Ketsia Liebig |
100 | Owad, Clinton | Ketsia Liebig |
101 | DiNocco, Lily | Ketsia Liebig |
102 | Gordon, David | Webster Butler |
103 | Saar-Beckles, Melinda | Webster Butler |
104 | Dickinson, Geoff | Webster Butler |
105 | Moran, Patrick | Webster Butler |
106 | Mckenna, Sandy | Webster Butler |
107 | Bachiochi, Linda | Webster Butler |
108 | Hunts, Julissa | Webster Butler |
109 | Mangal, Debbie | Webster Butler |
110 | Sweetwater, Alex | Webster Butler |
111 | Von Massenbach, Anna | Webster Butler |
112 | Becker, Scott | Webster Butler |
113 | Nowlan, Kristie | Webster Butler |
114 | Tippett, Jeanette | Webster Butler |
115 | Desimone, Carl | Webster Butler |