Wordcount Example With Apache Spark

mkdir wordcount-spark
cd wordcount-spark

mkdir -p src/main/scala

cat <<EOF > build.sbt
name := "wordcount"
version := "1.0"
scalaVersion := "2.11.8"
libraryDependencies += "org.apache.spark" %% "spark-core" % "1.6.0"

touch src/main/scala/SparkWordCount.scala

import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf

object SparkWordCount {
def main(args: Array[String]) {
// create Spark context with Spark configuration
val sc = new SparkContext(new SparkConf().setAppName("Spark Count"))

// get threshold
val threshold = args(1).toInt

// read in text file and split each document into words
val tokenized = sc.textFile(args(0)).flatMap(_.split(" "))

// count the occurrence of each word
val wordCounts = tokenized.map((_, 1)).reduceByKey(_ + _)

// filter out words with fewer than threshold occurrences
val filtered = wordCounts.filter(_._2 >= threshold)

// count characters
val charCounts = filtered.flatMap(_._1.toCharArray).map((_, 1)).reduceByKey(_ + _)

System.out.println(charCounts.collect().mkString(", "))

sbt package

cat <<EOF > /tmp/wordcount.txt
Hello world, Hello

cp target/scala-2.11/workcount_2.11-1.0.jar /tmp/

./bin/spark-submit --master "local[*]" --class SparkWordCount /tmp/wordcount_2.11-1.0.jar /tmp/wordcount.txt 1


Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Google photo

You are commenting using your Google account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s