from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, rank, dense_rank, lag, lead, col
from pyspark.sql.types import StructType,StructField,IntegerType,StringType,StringType,BooleanType,DateType,IntegerType
spark = SparkSession.builder.appName('2nd_highest_salary').master("local[*]").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")
emppath = "E:\\source_files\\employees.csv"
empdf = spark.read.format("csv").option("header","true").option("inferSchema", "true").option("sep", ",").load(emppath)
#empdf.show()
empdf.printSchema()
#Create window specification for applying window function
empdf.orderBy(col("salary").desc())
#Q1.Findout Departmentwise Highest Salary
window_spec = Window.partitionBy("department_id").orderBy(col("salary").desc())
#NB : Without Col("Salary") , .desc() method will not be available.
#Apply the window specification
result_df = empdf.withColumn("drank",dense_rank().over(window_spec)).where(col("drank")==1)
# result_df.show()
#Q1.Findout the 2nd Highest Salary in entire employee table , just remove the column_name from partitionBy
win_spec = Window.partitionBy().orderBy(col("salary").desc())
emp_df_with_drank_column = empdf.withColumn("drank",dense_rank().over(win_spec))
#emp_df_with_drank_column.show()
+-----------+----------+---------+--------+------------------+---------+-------+------+--------------+----------+-------------+-----+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME| EMAIL| PHONE_NUMBER|HIRE_DATE| JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|drank|
+-----------+----------+---------+--------+------------------+---------+-------+------+--------------+----------+-------------+-----+
| 100| Steven| King| SKING| 515.123.4567| 17-06-03|AD_PRES| 24000| null| null| 90| 1|
| 101| Neena| Kochhar|NKOCHHAR| 515.123.4568| 21-09-05| AD_VP| 17000| null| 100| 90| 2|
| 102| Lex| De Haan| LDEHAAN| 515.123.4569| 13-01-01| AD_VP| 17000| null| 100| 90| 2|
| 145| John| Russell| JRUSSEL|011.44.1344.429268| 01-10-04| SA_MAN| 14000| 0.4| 100| 80| 3|
| 146| Karen| Partners|KPARTNER|011.44.1344.467268| 05-01-05| SA_MAN| 13500| 0.3| 100| 80| 4|
| 201| Michael|Hartstein|MHARTSTE| 515.123.5555| 17-02-04| MK_MAN| 13000| null| 100| 20| 5|
| 205| Shelley| Higgins|SHIGGINS| 515.123.8080| 07-06-02| AC_MGR| 12008| null| 101| 110| 6|
| 108| Nancy|Greenberg|NGREENBE| 515.124.4569| 17-08-02| FI_MGR| 12008| null| 101| 100| 6|
| 147| Alberto|Errazuriz|AERRAZUR|011.44.1344.429278| 10-03-05| SA_MAN| 12000| 0.3| 100| 80| 7|
| 168| Lisa| Ozer| LOZER|011.44.1343.929268| 11-03-05| SA_REP| 11500| 0.25| 148| 80| 8|
| 114| Den| Raphaely|DRAPHEAL| 515.127.4561| 07-12-02| PU_MAN| 11000| null| 100| 30| 9|
| 148| Gerald|Cambrault|GCAMBRAU|011.44.1344.619268| 15-10-07| SA_MAN| 11000| 0.3| 100| 80| 9|
| 174| Ellen| Abel| EABEL|011.44.1644.429267| 11-05-04| SA_REP| 11000| 0.3| 149| 80| 9|
| 149| Eleni| Zlotkey|EZLOTKEY|011.44.1344.429018| 29-01-08| SA_MAN| 10500| 0.2| 100| 80| 10|
| 162| Clara| Vishney|CVISHNEY|011.44.1346.129268| 11-11-05| SA_REP| 10500| 0.25| 147| 80| 10|
| 204| Hermann| Baer| HBAER| 515.123.8888| 07-06-02| PR_REP| 10000| null| 101| 70| 11|
| 150| Peter| Tucker| PTUCKER|011.44.1344.129268| 30-01-05| SA_REP| 10000| 0.3| 145| 80| 11|
| 156| Janette| King| JKING|011.44.1345.429268| 30-01-04| SA_REP| 10000| 0.35| 146| 80| 11|
| 169| Harrison| Bloom| HBLOOM|011.44.1343.829268| 23-03-06| SA_REP| 10000| 0.2| 148| 80| 11|
| 170| Tayler| Fox| TFOX|011.44.1343.729268| 24-01-06| SA_REP| 9600| 0.2| 148| 80| 12|
+-----------+----------+---------+--------+------------------+---------+-------+------+--------------+----------+-------------+-----+
#Add drank == 2 filter condition to get the desired result.
result_df = empdf.withColumn( "drank",dense_rank().over(win_spec) ).where(col("drank")==2)
result_df.show()
+-----------+----------+---------+--------+------------+---------+------+------+--------------+----------+-------------+-----+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME| EMAIL|PHONE_NUMBER|HIRE_DATE|JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|drank|
+-----------+----------+---------+--------+------------+---------+------+------+--------------+----------+-------------+-----+
| 101| Neena| Kochhar|NKOCHHAR|515.123.4568| 21-09-05| AD_VP| 17000| null| 100| 90| 2|
| 102| Lex| De Haan| LDEHAAN|515.123.4569| 13-01-01| AD_VP| 17000| null| 100| 90| 2|
+-----------+----------+---------+--------+------------+---------+------+------+--------------+----------+-------------+-----+
No comments:
Post a Comment