Sunday, September 15, 2024

Pyspark Program for 2nd highest salary deptwise

 

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
#from pyspark.sql.functions import row_number, rank, dense_rank, lag, lead, col
from pyspark.sql.window import Window

# Create a SparkSession
spark = SparkSession.builder.appName('2nd_highest_salary').master("local[*]").getOrCreate()
print(f'The PySpark {spark.version} version is running...')
'''
spark.sparkContext.setLogLevel("ERROR")
emppath = "E:\\source_files\\employees.csv"
empdf = spark.read.format("csv").option("header","true").option("inferSchema", "true").option("sep", ",").load(emppath)
empdf.write.format("parquet").mode("overwrite").save("E:\\target_files\\emp.parquet")
empdf.write.format("parquet").saveAsTable("my_table")

'''
# Read the CSV file (replace "your_csv_file.csv" with your actual file)
df = spark.read.csv("E:\\source_files\\employees.csv", header=True, inferSchema=True)

# Assuming columns are named "department" and "salary"
windowSpec = Window.partitionBy("department_id").orderBy(F.desc("salary"))
#window_spec = Window.partitionBy("department_id").orderBy(col("salary").desc())

# Calculate the dense rank of each salary within its department
df = df.withColumn("rank", F.dense_rank().over(windowSpec))

# Filter for rows with a rank of 2 (second-highest)
second_highest_deptwise_salaries = df.filter(F.col("rank") == 2)

# Show the results
second_highest_deptwise_salaries.show()

'''
#Q1.Findout the 2nd Highest Salary in entire employee table , just remove the column_name from partitionBy
win_spec = Window.partitionBy().orderBy(col("salary").desc())
emp_df_with_drank_column = empdf.withColumn("drank",dense_rank().over(win_spec))
#emp_df_with_drank_column.show()
#Add drank == 2 filter condition to get the desired result.
result_df = empdf.withColumn( "drank",dense_rank().over(win_spec) ).where(col("drank")==2)
result_df.show()
'''

'''
+-----------+----------+---------+--------+------------------+---------+----------+------+--------------+----------+-------------+----+
| 202| Pat| Fay| PFAY| 603.123.6666| 17-08-05| MK_REP| 6000| NULL| 201| 20| 2|
| 115| Alexander| Khoo| AKHOO| 515.127.4562| 18-05-03| PU_CLERK| 3100| NULL| 114| 30| 2|
| 120| Matthew| Weiss| MWEISS| 650.123.1234| 18-07-04| ST_MAN| 8000| NULL| 100| 50| 2|
| 104| Bruce| Ernst| BERNST| 590.423.4568| 21-05-07| IT_PROG| 6000| NULL| 103| 60| 2|
| 146| Karen| Partners|KPARTNER|011.44.1344.467268| 05-01-05| SA_MAN| 13500| 0.3| 100| 80| 2|
| 101| Neena| Kochhar|NKOCHHAR| 515.123.4568| 21-09-05| AD_VP| 17000| NULL| 100| 90| 2|
| 102| Lex| De Haan| LDEHAAN| 515.123.4569| 13-01-01| AD_VP| 17000| NULL| 100| 90| 2|
| 109| Daniel| Faviet| DFAVIET| 515.124.4169| 16-08-02|FI_ACCOUNT| 9000| NULL| 108| 100| 2|
| 206| William| Gietz| WGIETZ| 515.123.8181| 07-06-02|AC_ACCOUNT| 8300| NULL| 205| 110| 2|
+-----------+----------+---------+--------+------------------+---------+----------+------+--------------+----------+-------------+----+


'''


No comments:

Post a Comment