Monday, December 11, 2023

2nd Highest Salary by dataframe Dense_rank Function

from pyspark.sql import SparkSession from pyspark.sql.window import Window from pyspark.sql.functions import row_number, rank, dense_rank, lag, lead, col from pyspark.sql.types import StructType,StructField,IntegerType,StringType,StringType,BooleanType,DateType,IntegerType spark = SparkSession.builder.appName('2nd_highest_salary').master("local[*]").getOrCreate() spark.sparkContext.setLogLevel("ERROR") emppath = "E:\\source_files\\employees.csv" empdf = spark.read.format("csv").option("header","true").option("inferSchema", "true").option("sep", ",").load(emppath) #empdf.show() empdf.printSchema() #Create window specification for applying window function empdf.orderBy(col("salary").desc()) #Q1.Findout Departmentwise Highest Salary window_spec = Window.partitionBy("department_id").orderBy(col("salary").desc()) #NB : Without Col("Salary") , .desc() method will not be available. #Apply the window specification result_df = empdf.withColumn("drank",dense_rank().over(window_spec)).where(col("drank")==1) # result_df.show() #Q1.Findout the 2nd Highest Salary in entire employee table , just remove the column_name from partitionBy win_spec = Window.partitionBy().orderBy(col("salary").desc()) emp_df_with_drank_column = empdf.withColumn("drank",dense_rank().over(win_spec)) #emp_df_with_drank_column.show() +-----------+----------+---------+--------+------------------+---------+-------+------+--------------+----------+-------------+-----+ |EMPLOYEE_ID|FIRST_NAME|LAST_NAME| EMAIL| PHONE_NUMBER|HIRE_DATE| JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|drank| +-----------+----------+---------+--------+------------------+---------+-------+------+--------------+----------+-------------+-----+ | 100| Steven| King| SKING| 515.123.4567| 17-06-03|AD_PRES| 24000| null| null| 90| 1| | 101| Neena| Kochhar|NKOCHHAR| 515.123.4568| 21-09-05| AD_VP| 17000| null| 100| 90| 2| | 102| Lex| De Haan| LDEHAAN| 515.123.4569| 13-01-01| AD_VP| 17000| null| 100| 90| 2| | 145| John| Russell| JRUSSEL|011.44.1344.429268| 01-10-04| SA_MAN| 14000| 0.4| 100| 80| 3| | 146| Karen| Partners|KPARTNER|011.44.1344.467268| 05-01-05| SA_MAN| 13500| 0.3| 100| 80| 4| | 201| Michael|Hartstein|MHARTSTE| 515.123.5555| 17-02-04| MK_MAN| 13000| null| 100| 20| 5| | 205| Shelley| Higgins|SHIGGINS| 515.123.8080| 07-06-02| AC_MGR| 12008| null| 101| 110| 6| | 108| Nancy|Greenberg|NGREENBE| 515.124.4569| 17-08-02| FI_MGR| 12008| null| 101| 100| 6| | 147| Alberto|Errazuriz|AERRAZUR|011.44.1344.429278| 10-03-05| SA_MAN| 12000| 0.3| 100| 80| 7| | 168| Lisa| Ozer| LOZER|011.44.1343.929268| 11-03-05| SA_REP| 11500| 0.25| 148| 80| 8| | 114| Den| Raphaely|DRAPHEAL| 515.127.4561| 07-12-02| PU_MAN| 11000| null| 100| 30| 9| | 148| Gerald|Cambrault|GCAMBRAU|011.44.1344.619268| 15-10-07| SA_MAN| 11000| 0.3| 100| 80| 9| | 174| Ellen| Abel| EABEL|011.44.1644.429267| 11-05-04| SA_REP| 11000| 0.3| 149| 80| 9| | 149| Eleni| Zlotkey|EZLOTKEY|011.44.1344.429018| 29-01-08| SA_MAN| 10500| 0.2| 100| 80| 10| | 162| Clara| Vishney|CVISHNEY|011.44.1346.129268| 11-11-05| SA_REP| 10500| 0.25| 147| 80| 10| | 204| Hermann| Baer| HBAER| 515.123.8888| 07-06-02| PR_REP| 10000| null| 101| 70| 11| | 150| Peter| Tucker| PTUCKER|011.44.1344.129268| 30-01-05| SA_REP| 10000| 0.3| 145| 80| 11| | 156| Janette| King| JKING|011.44.1345.429268| 30-01-04| SA_REP| 10000| 0.35| 146| 80| 11| | 169| Harrison| Bloom| HBLOOM|011.44.1343.829268| 23-03-06| SA_REP| 10000| 0.2| 148| 80| 11| | 170| Tayler| Fox| TFOX|011.44.1343.729268| 24-01-06| SA_REP| 9600| 0.2| 148| 80| 12| +-----------+----------+---------+--------+------------------+---------+-------+------+--------------+----------+-------------+-----+ #Add drank == 2 filter condition to get the desired result. result_df = empdf.withColumn( "drank",dense_rank().over(win_spec) ).where(col("drank")==2) result_df.show() +-----------+----------+---------+--------+------------+---------+------+------+--------------+----------+-------------+-----+ |EMPLOYEE_ID|FIRST_NAME|LAST_NAME| EMAIL|PHONE_NUMBER|HIRE_DATE|JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|drank| +-----------+----------+---------+--------+------------+---------+------+------+--------------+----------+-------------+-----+ | 101| Neena| Kochhar|NKOCHHAR|515.123.4568| 21-09-05| AD_VP| 17000| null| 100| 90| 2| | 102| Lex| De Haan| LDEHAAN|515.123.4569| 13-01-01| AD_VP| 17000| null| 100| 90| 2| +-----------+----------+---------+--------+------------+---------+------+------+--------------+----------+-------------+-----+

No comments:

Post a Comment