# 1st Simple Glue Job
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from pyspark.sql.functions import *
from awsglue.context import GlueContext
from awsglue.job import Job
## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
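# getResolvedOptions can also resolve extra parameters passed to the job as
# "--key value" arguments. A hedged sketch; 'SOURCE_PATH' is a hypothetical
# parameter, not one this job defines:
# args = getResolvedOptions(sys.argv, ['JOB_NAME', 'SOURCE_PATH'])
# data = args['SOURCE_PATH']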
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session

# Initialize the Glue job before the job logic runs (required for job bookmarks to work).
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
data ="s3://rishita2023poc/input/bank-full.csv"
# The backslash (\) continuation character goes after the closing parenthesis and before the '.' of the next chained call.
adf = spark.read.format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .option("sep", ";") \
    .load(data)
adf.show()
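# Optionally, print the schema to verify what inferSchema deduced:
# adf.printSchema()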
res = adf.where(col("age") > 60)
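# An equivalent SQL-expression filter, if preferred:
# res = adf.filter("age > 60")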
op="s3://rishita2023poc/output"
res.write.format("csv") \
    .option("header", "true") \
    .mode("overwrite") \
    .save(op)
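# Note: Spark writes one CSV part-file per partition. For a small result like
# this one, a single output file can be produced by coalescing first:
# res.coalesce(1).write.format("csv").option("header", "true").mode("overwrite").save(op)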
'''
### To store the output in a MySQL RDS database, add the MySQL connector JAR from the S3 path below in the job's "Dependent JARs path" setting.
#Dependent Jars path: s3://rishita2023poc/drivers/mysql-connector-java-8.0.12.jar
host="jdbc:mysql://mysqldb.cwkqaojgxfrd.ap-south-1.rds.amazonaws.com:3306/newdb"
res.write.format("jdbc")\
.option("url",host)\
.option("user","myuser")\
.option("password","mypassword")\
.option("driver","com.mysql.cj.jdbc.Driver")\
.option("dbtable","gluebank60")\
.save()
'''
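# The same MySQL write can also be expressed with Glue's own DynamicFrame API.
# A minimal sketch, assuming the host/user/password values from the commented
# block above are defined:
# from awsglue.dynamicframe import DynamicFrame
# dyf = DynamicFrame.fromDF(res, glueContext, "res_dyf")
# glueContext.write_dynamic_frame.from_options(
#     frame=dyf,
#     connection_type="mysql",
#     connection_options={"url": host, "dbtable": "gluebank60",
#                         "user": "myuser", "password": "mypassword"},
# )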
# Mark the job run as successfully completed.
job.commit()