Tuesday, November 14, 2023

#1st Simple Glue Job

This job reads the semicolon-delimited bank-full.csv file from S3, filters the rows where age is greater than 60, and writes the result back to S3 as CSV. The commented-out block at the end shows how the same result can be written to a MySQL RDS table over JDBC instead.

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from pyspark.sql.functions import *
from awsglue.context import GlueContext
from awsglue.job import Job

## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session

# Initialize the Glue job before doing any work; commit it at the end.
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

data = "s3://rishita2023poc/input/bank-full.csv"

# The line-continuation backslash must come after the closing
# parenthesis but before the '.' of the next chained call.
adf = spark.read.format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .option("sep", ";") \
    .load(data)
adf.show()

# Keep only customers older than 60.
res = adf.where(col("age") > 60)

op = "s3://rishita2023poc/output"
res.write.format("csv") \
    .option("header", "true") \
    .mode("overwrite") \
    .save(op)

'''
### To store the output in a MySQL RDS database, add the MySQL connector
### jar from its S3 path in the job's "Dependent JARs path" section.
# Dependent JARs path: s3://rishita2023poc/drivers/mysql-connector-java-8.0.12.jar
host = "jdbc:mysql://mysqldb.cwkqaojgxfrd.ap-south-1.rds.amazonaws.com:3306/newdb"
res.write.format("jdbc") \
    .option("url", host) \
    .option("user", "myuser") \
    .option("password", "mypassword") \
    .option("driver", "com.mysql.cj.jdbc.Driver") \
    .option("dbtable", "gluebank60") \
    .save()
'''

job.commit()
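Once the script is saved as a Glue job, it can also be started from outside the console. Below is a minimal sketch using boto3; the job name "1st-simple-glue-job" is a placeholder for whatever name the job was created under, and the region is assumed to be ap-south-1 to match the RDS host above.

import boto3

# Hypothetical job name -- replace with the name the Glue job was created under.
glue = boto3.client("glue", region_name="ap-south-1")

# Kick off a run of the job; Glue injects JOB_NAME automatically.
run = glue.start_job_run(JobName="1st-simple-glue-job")
print(run["JobRunId"])

# Check the run status (STARTING, RUNNING, SUCCEEDED, FAILED, ...).
status = glue.get_job_run(JobName="1st-simple-glue-job", RunId=run["JobRunId"])
print(status["JobRun"]["JobRunState"])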
