Scala Parser for YAML File


import org.yaml.snakeyaml.Yaml

object Main {
  def main(args: Array[String]): Unit = {
    val text = scala.io.Source.fromInputStream(
      getClass.getResourceAsStream("codes.yaml")).mkString
    val yaml = new Yaml
    // SnakeYAML parses into plain java.util collections, so cast step by step
    val codes = yaml.load(text)
      .asInstanceOf[java.util.LinkedHashMap[String, java.util.Map[String, Any]]]
    val values = codes.get("duration").get("values")
      .asInstanceOf[java.util.ArrayList[java.util.Map[String, Any]]]
    // print the "code" field of every entry under duration.values
    values.toArray.foreach(c =>
      println(c.asInstanceOf[java.util.LinkedHashMap[String, Any]].get("code")))
  }
}

codes.yaml:

duration:
  values:
    -
      code: 1
      from: 0
      to: 1
    -
      code: 2
      from: 1
      to: 3

build.sbt

libraryDependencies += "org.yaml" % "snakeyaml" % "1.8"
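
The raw java.util casts above work, but they are noisy. If you prefer Scala collections, scala.collection.JavaConverters can wrap each parsed layer; here is a minimal sketch against the same codes.yaml (TypedMain is just an illustrative name):

import org.yaml.snakeyaml.Yaml
import scala.collection.JavaConverters._

object TypedMain {
  def main(args: Array[String]): Unit = {
    val text = scala.io.Source.fromInputStream(
      getClass.getResourceAsStream("codes.yaml")).mkString
    // wrap the java.util structures as Scala collections after each cast
    val root = new Yaml().load(text)
      .asInstanceOf[java.util.Map[String, java.util.Map[String, Any]]].asScala
    val values = root("duration").get("values")
      .asInstanceOf[java.util.List[java.util.Map[String, Any]]].asScala
    values.foreach(v => println(v.get("code")))
  }
}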


WordCount MapReduce with Scalding

git clone https://github.com/scalding-io/ProgrammingWithScalding

cd ProgrammingWithScalding/chapter2/

mvn clean install

hadoop fs -mkdir -p /data/input

hadoop fs -mkdir -p /data/output

echo "This is a happy day. A day to remember" > /tmp/input.txt

hadoop fs -put /tmp/input.txt /data/input

hadoop jar /root/repo/ProgrammingWithScalding/chapter2/target/chapter2-0-jar-with-dependencies.jar com.twitter.scalding.Tool WordCountJob --local --input /data/input/input.txt --output /data/output/output.txt

cat /data/output/output.txt
a 2
day 1
day. 1
happy 1
is 1
remember 1
this 1
to 1
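
For context, a word-count job like the one invoked above takes only a few lines in Scalding's fields-based API. A minimal sketch (the actual chapter2 job in the repo may differ in detail):

import com.twitter.scalding._

class WordCountJob(args: Args) extends Job(args) {
  // read lines, split into lowercase words, count per word, write TSV
  TextLine(args("input"))
    .flatMap('line -> 'word) { line: String => line.toLowerCase.split("\\s+") }
    .groupBy('word) { _.size }
    .write(Tsv(args("output")))
}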

Wordcount Example With Apache Spark

mkdir wordcount-spark
cd wordcount-spark

mkdir -p src/main/scala

cat <<EOF > build.sbt
name := "wordcount"
version := "1.0"
scalaVersion := "2.11.8"
libraryDependencies += "org.apache.spark" %% "spark-core" % "1.6.0"
EOF

cat <<EOF > src/main/scala/SparkWordCount.scala

import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf

object SparkWordCount {
  def main(args: Array[String]): Unit = {
    // create Spark context with Spark configuration
    val sc = new SparkContext(new SparkConf().setAppName("Spark Count"))

    // get threshold from the second command-line argument
    val threshold = args(1).toInt

    // read in text file and split each line into words
    val tokenized = sc.textFile(args(0)).flatMap(_.split(" "))

    // count the occurrence of each word
    val wordCounts = tokenized.map((_, 1)).reduceByKey(_ + _)

    // filter out words with fewer than threshold occurrences
    val filtered = wordCounts.filter(_._2 >= threshold)

    // count characters across the remaining words
    val charCounts = filtered.flatMap(_._1.toCharArray).map((_, 1)).reduceByKey(_ + _)

    println(charCounts.collect().mkString(", "))

    sc.stop()
  }
}
EOF

sbt package

cat <<EOF > /tmp/wordcount.txt
Hello world, Hello
EOF

cp target/scala-2.11/wordcount_2.11-1.0.jar /tmp/

cd $SPARK_HOME
./bin/spark-submit --master "local[*]" --class SparkWordCount /tmp/wordcount_2.11-1.0.jar /tmp/wordcount.txt 1
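
To iterate without repackaging the jar each time, the same pipeline can be pasted into spark-shell, which provides a ready-made SparkContext; a quick sketch:

// spark-shell gives us sc; same word-count pipeline, no jar needed
val counts = sc.textFile("/tmp/wordcount.txt")
  .flatMap(_.split(" "))
  .map((_, 1))
  .reduceByKey(_ + _)
counts.collect().foreach(println)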