Create Spark DataFrames from CSV and Excel Files

 




Create a Spark DataFrame from a CSV File

from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

# Attach to the SparkContext that is already running (or start one) and wrap
# it in a SparkSession, which provides the DataFrame reader used below.
sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

# Read the CSV into a Spark DataFrame.
#   header      -> take the column names from the first line of the file
#   inferschema -> let Spark derive column types from the data instead of
#                  reading every column as a string
df1 = spark.read.load(
    "/FileStore/tables/insurance.csv",
    format="csv",
    inferschema="True",
    header="True",
)

# Print up to the first 5000 rows to the console.
df1.show(5000)



Create a pandas DataFrame from a CSV File

import pandas as pd

# Location of the sample CSV file.
# NOTE(review): this looks like a Databricks DBFS location; pandas reads via
# the local file API, so it may need a "/dbfs" prefix there -- confirm against
# the environment this runs in.
csv_path = '/FileStore/tables/insurance.csv'

# Load the whole file into an in-memory pandas DataFrame and print it.
pd_df = pd.read_csv(csv_path)
print(pd_df)



Create a pandas DataFrame from an Excel File and Convert It to a Spark DataFrame


import pandas as pd
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

# Step 1: load the workbook into a pandas DataFrame (read_excel reads the
# first sheet by default). pandas needs an Excel engine installed for this,
# e.g. `pip install openpyxl` for .xlsx files.
pd_df = pd.read_excel('C:\\Move\\test.xlsx')
# print(pd_df)  # uncomment to inspect the pandas DataFrame

# Step 2: attach to the running SparkContext (or start one) and wrap it in a
# SparkSession.
sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

# Step 3: enable Arrow so the pandas -> Spark conversion is transferred in
# columnar batches. This requires PyArrow: `pip install pyarrow`.
# Spark 3.0 renamed the setting; the old key is deprecated there, so pick the
# key that matches the running Spark version.
spark_major = int(spark.version.split(".")[0])
arrow_conf_key = (
    "spark.sql.execution.arrow.pyspark.enabled"
    if spark_major >= 3
    else "spark.sql.execution.arrow.enabled"
)
spark.conf.set(arrow_conf_key, "true")

# Step 4: convert the pandas DataFrame to a Spark DataFrame and print up to
# the first 500 rows.
sparkDF = spark.createDataFrame(pd_df)
# sparkDF.printSchema()  # uncomment to inspect the inferred Spark schema
sparkDF.show(500)




No comments: