<?xml version="1.0" encoding="utf-8"?> 
<rss version="2.0"
  xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd"
  xmlns:atom="http://www.w3.org/2005/Atom">

<channel>

<title>Yuriy Gavrilov: posts tagged spark</title>
<link>https://gavrilov.info/tags/spark/</link>
<description>Welcome to my personal place for love, peace and happiness 🤖 Yuriy Gavrilov</description>
<author></author>
<language>en</language>
<generator>Aegea 11.4 (v4171e)</generator>

<itunes:owner>
<itunes:name></itunes:name>
<itunes:email>yvgavrilov@gmail.com</itunes:email>
</itunes:owner>
<itunes:subtitle>Welcome to my personal place for love, peace and happiness 🤖 Yuriy Gavrilov</itunes:subtitle>
<itunes:image href="https://gavrilov.info/pictures/userpic/userpic-square@2x.jpg?1643451008" />
<itunes:explicit>no</itunes:explicit>

<item>
<title>Reading Avro in Spark from S3</title>
<guid isPermaLink="false">45</guid>
<link>https://gavrilov.info/all/chtenie-avro-v-spark-iz-s3/</link>
<pubDate>Tue, 28 Feb 2023 20:31:52 +0300</pubDate>
<author></author>
<comments>https://gavrilov.info/all/chtenie-avro-v-spark-iz-s3/</comments>
<description>
&lt;pre&gt;&lt;code&gt;import pyspark
import os

S3_ACCESS_KEY = os.environ.get("S3_ACCESS_KEY")
S3_BUCKET = os.environ.get("S3_BUCKET")
S3_SECRET_KEY = os.environ.get("S3_SECRET_KEY")
S3_ENDPOINT = os.environ.get("S3_ENDPOINT")

# This cell may take some time to run the first time,
# as it must download the necessary Spark jars.
conf = pyspark.SparkConf()

# If you are using the Spark containers, uncomment the line below
# to offload execution of Spark tasks to the Spark containers.
# conf.setMaster("spark://spark:7077")

conf.set("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.1,io.delta:delta-core_2.12:2.1.0,org.apache.spark:spark-avro_2.12:3.3.2")

# conf.set("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider")
conf.set("spark.hadoop.fs.s3a.endpoint", S3_ENDPOINT)
conf.set("spark.hadoop.fs.s3a.access.key", S3_ACCESS_KEY)
conf.set("spark.hadoop.fs.s3a.secret.key", S3_SECRET_KEY)
conf.set("spark.hadoop.fs.s3a.path.style.access", "true")
conf.set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
conf.set("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

sc = pyspark.SparkContext(conf=conf)
# sc.setLogLevel("INFO")

spark = pyspark.sql.SparkSession(sc)

df = spark.read.format("avro").load(f"s3a://{S3_BUCKET}/person2.avro")&lt;/code&gt;&lt;/pre&gt;
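&lt;p&gt;The packages line above also pulls in Delta Lake (delta-core plus the Delta SQL extension and catalog), which this snippet configures but never uses. A minimal sketch of a round trip through a Delta table, assuming write access to the same bucket and a hypothetical person2_delta path:&lt;/p&gt;
&lt;pre&gt;&lt;code&gt;# Quick sanity check of what was read from S3.
df.printSchema()
df.show(5)

# Write the Avro data back out as a Delta table (the path is hypothetical);
# this works because delta-core and DeltaSparkSessionExtension are configured above.
df.write.format("delta").mode("overwrite").save(f"s3a://{S3_BUCKET}/person2_delta")

# Read it back to confirm the round trip.
spark.read.format("delta").load(f"s3a://{S3_BUCKET}/person2_delta").show(5)&lt;/code&gt;&lt;/pre&gt;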
</description>
</item>

<item>
<title>Reading JSON in Spark from S3</title>
<guid isPermaLink="false">44</guid>
<link>https://gavrilov.info/all/chtenie-json-v-spark-iz-s3/</link>
<pubDate>Mon, 27 Feb 2023 21:26:01 +0300</pubDate>
<author></author>
<comments>https://gavrilov.info/all/chtenie-json-v-spark-iz-s3/</comments>
<description>
&lt;pre&gt;&lt;code&gt;import pyspark
import os

S3_ACCESS_KEY = os.environ.get("S3_ACCESS_KEY")
S3_BUCKET = os.environ.get("S3_BUCKET")
S3_SECRET_KEY = os.environ.get("S3_SECRET_KEY")
S3_ENDPOINT = os.environ.get("S3_ENDPOINT")

# This cell may take some time to run the first time,
# as it must download the necessary Spark jars.
conf = pyspark.SparkConf()

# If you are using the Spark containers, uncomment the line below
# to offload execution of Spark tasks to the Spark containers.
# conf.setMaster("spark://spark:7077")

conf.set("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.1,io.delta:delta-core_2.12:2.1.0,org.apache.spark:spark-avro_2.12:3.3.2")

# conf.set("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider")
conf.set("spark.hadoop.fs.s3a.endpoint", S3_ENDPOINT)
conf.set("spark.hadoop.fs.s3a.access.key", S3_ACCESS_KEY)
conf.set("spark.hadoop.fs.s3a.secret.key", S3_SECRET_KEY)
conf.set("spark.hadoop.fs.s3a.path.style.access", "true")
conf.set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
conf.set("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

sc = pyspark.SparkContext(conf=conf)
# sc.setLogLevel("INFO")

spark = pyspark.sql.SparkSession(sc)

# "org.apache.spark.sql.json" is the fully qualified name of the built-in
# JSON source; format("json") would be equivalent.
df = spark.read.format("org.apache.spark.sql.json").load(f"s3a://{S3_BUCKET}/apple3.json")
df.show()&lt;/code&gt;&lt;/pre&gt;
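&lt;p&gt;By default Spark expects one JSON object per line. A minimal sketch for reading a pretty-printed (multi-line) file with an explicit schema, assuming hypothetical field names for apple3.json:&lt;/p&gt;
&lt;pre&gt;&lt;code&gt;from pyspark.sql.types import StructType, StructField, StringType, DoubleType

# Hypothetical schema; adjust the fields to match apple3.json.
schema = StructType([
    StructField("name", StringType()),
    StructField("price", DoubleType()),
])

# multiLine lets Spark parse a single pretty-printed JSON document,
# and an explicit schema skips the extra inference pass over S3.
df2 = (spark.read
       .schema(schema)
       .option("multiLine", "true")
       .json(f"s3a://{S3_BUCKET}/apple3.json"))
df2.show()&lt;/code&gt;&lt;/pre&gt;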
</description>
</item>


</channel>
</rss>