import sys
import time
import json
from datetime import datetime, date

import boto3

from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import col, lit, trim, when
from pyspark.sql.types import TimestampType

from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions


## Get the argument list
args = getResolvedOptions(
    sys.argv,
    ['JOB_NAME', 'secret_id', 'bucket', 'source_prefix', 'source_prefix_common',
     'source_filename', 'rolename', 'temp_folder_name', 'redshift_database',
     'redshift_table', 'redshift_delete_table'])

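# Dates used for the effective-from date, the effective-from timestamp key, and the audit insert timestamp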
today1 = date.today()
curr_dt = today1.strftime("%Y-%m-%d")
curr_dt1 = today1.strftime("%Y%m%d")
current_date = datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
spark = SparkSession.builder.getOrCreate()

## Initialize the Spark context and Glue context, and tie the job to the Glue context
sc = spark.sparkContext
glueContext = GlueContext(sc)
job = Job(glueContext)
job.init(args['JOB_NAME'])
sqlContext = SQLContext(sc)

# Initiate connection to S3 using boto3
s3 = boto3.resource('s3')
bucketname = args['bucket']
source_prefix = args['source_prefix']
source_prefix_common = args['source_prefix_common']
source_filename = args['source_filename']


source_path = "s3://" + bucketname + "/" + source_prefix + "/" + source_filename  # "s3://qa-consumer-business/AuctionNet/staging/parquet_files/Anet_Data_Hyundai.parquet/"


### Set up Secrets Manager configuration
secret_id = args['secret_id']

# Getting DB credentials from Secrets Manager
client = boto3.client("secretsmanager", region_name="us-west-2")

get_secret_value_response = client.get_secret_value(
    SecretId=secret_id
)

secret = get_secret_value_response['SecretString']
secret = json.loads(secret)

username = secret.get('username')
password = secret.get('password')
auth_id = secret.get('auth_id')
connection_string = secret.get('connection_string')

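# Build the Redshift JDBC URL, the IAM role ARN used by the Redshift connector,
# the S3 temp directory, and the target/delete table paths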
url = connection_string + "user=" + username + "&password=" + password
rolename = args['rolename']
authorization = "arn:aws:iam::" + auth_id + ":role/" + rolename
temp_folder_name = args['temp_folder_name']
temp_directory = "s3://" + bucketname + "/" + temp_folder_name
redshift_database = args['redshift_database']
redshift_table = args['redshift_table']
redshift_delete_table = args['redshift_delete_table']
redshift_target_table_path = redshift_database + "." + redshift_table
redshift_delete_table_path = redshift_database + "." + redshift_delete_table

# Read the data from the Parquet files and trim whitespace from every column

df_srctmp_Anet_Data_Hyundai = spark.read.option("header", "true").parquet(source_path)

for colname in df_srctmp_Anet_Data_Hyundai.columns:
    df_srctmp_Anet_Data_Hyundai = df_srctmp_Anet_Data_Hyundai.withColumn(colname, trim(col(colname)))


print("extraction from source system : " + str(df_srctmp_Anet_Data_Hyundai.count()))
print("source file data is loaded into dataframe")

# Append the _source suffix to every column name

new_column_name_list = list(map(lambda x: x + '_source', df_srctmp_Anet_Data_Hyundai.columns))

df_source_final = df_srctmp_Anet_Data_Hyundai.toDF(*new_column_name_list)

print("Target table extraction completed")

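# Generate a 1-based row sequence: auct_sk_dummy defines the extra column in the schema,
# and zipWithIndex supplies the actual per-row sequence value for it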
df_0_schema = df_source_final.withColumn("auct_sk_dummy", lit(1))
rdd_1 = df_source_final.rdd.zipWithIndex().map(lambda row_rowId: (list(row_rowId[0]) + [row_rowId[1] + 1]))
df_intr_insert_seq = sqlContext.createDataFrame(rdd_1, schema=df_0_schema.schema)

print("seq is generated")

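# Fetch the current maximum surrogate key (auct_sk) from the Redshift target table,
# defaulting to 0 when the table is empty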
df_seqnum = (sqlContext.read.format("com.databricks.spark.redshift")
    .option("url", url)
    .option("aws_iam_role", authorization)
    .option("tempdir", temp_directory)
    .option("query", "select max(auct_sk) as auct_sk1 from " + redshift_database + "." + redshift_table)
    .load())
df_seqnum_tmp = df_seqnum.withColumn('auct_sk_max', when(df_seqnum.auct_sk1.isNull(), 0).otherwise(df_seqnum.auct_sk1))
df_seqnum_max = df_seqnum_tmp.selectExpr("auct_sk_max")

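# Offset the generated sequence by the current max key via a cross join, stamp the audit
# columns, and map the *_source columns to the target column names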
df_intr_cross_join = df_intr_insert_seq.crossJoin(df_seqnum_max)
df_insert_final = (df_intr_cross_join
    .withColumn('auct_sk', df_intr_cross_join.auct_sk_dummy + df_intr_cross_join.auct_sk_max)
    .withColumn('today_dt', lit(current_date).cast(TimestampType()))
    .withColumn('eff_from_date', lit(curr_dt))
    .withColumn('eff_from_dtstmp', lit(curr_dt1))
    .withColumn('update_dt', lit(None).cast(TimestampType()))
    .withColumn('current_flg', lit('Y'))
    .selectExpr(
        "auct_sk", "VIN_source as vin", "SALE_WEEK_source as sale_week", "eff_from_date as eff_from_date",
        "ANET_REGION_source as anet_region", "NADA_REGION_source as nada_region", "SALE_PRICE_source as sale_price",
        "MILEAGE_source as mileage", "SALE_TYPE_source as sale_type_cd", "EXTERIOR_COLOR_source as ext_color",
        "INTERIOR_COLOR_source as int_color", "TRIM_source as trim_desc", "DUPLICATE_VIN_INDICATOR_source as dup_vin_ind",
        "TRIM_DECODE_INDICATOR_source as trim_decode_ind", "VIN_DECODE_ID_source as veh_id",
        "TRIM_DECODE_ID_source as trim_id", "LEGACY_DECODE_ID_source as legacy_id", "eff_from_dtstmp as eff_from_dtstmp",
        "today_dt as sys_ins_date", "update_dt as sys_upd_date", "current_flg as is_cur_ind"))
df_insert_final = df_insert_final.dropDuplicates()

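# Cast every column to the data type expected by the Redshift target table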
df_insert_final1=df_insert_final.select(df_insert_final.auct_sk.cast('integer'),
df_insert_final.vin.cast('string'),
df_insert_final.sale_week.cast('date'),
df_insert_final.eff_from_date.cast('date'),
df_insert_final.anet_region.cast('string'),
df_insert_final.nada_region.cast('integer'),
df_insert_final.sale_price.cast('integer'),
df_insert_final.mileage.cast('integer'),
df_insert_final.sale_type_cd.cast('string'),
df_insert_final.ext_color.cast('string'),
df_insert_final.int_color.cast('string'),
df_insert_final.trim_desc.cast('string'),
df_insert_final.dup_vin_ind.cast('string'),
df_insert_final.trim_decode_ind.cast('string'),
df_insert_final.veh_id.cast('integer'),
df_insert_final.trim_id.cast('integer'),
df_insert_final.legacy_id.cast('integer'),
df_insert_final.eff_from_dtstmp.cast('integer'),
df_insert_final.sys_ins_date.cast('timestamp'),
df_insert_final.sys_upd_date.cast('timestamp'),
df_insert_final.is_cur_ind.cast('string'))

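# Write to Redshift: the preaction deletes rows whose auct_sk appears in the delete
# staging table, then the new rows are appended to the target table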
(df_insert_final1.write
    .format("com.databricks.spark.redshift")
    .option("url", url)
    .option("dbtable", redshift_target_table_path)
    .option("include_column_list", "true")
    .option("aws_iam_role", authorization)
    .option("preactions", "delete from " + redshift_target_table_path + " where auct_sk in (select auct_sk from " + redshift_delete_table_path + ")")
    .option("tempdir", temp_directory)
    .mode("append")
    .save())

print('Consumer Auction data loaded successfully')
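
# Commit the Glue job run (pairs with job.init above)
job.commit()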
