Spark高级数据分析 · 第6章 LSA（潜在语义分析）
wget http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles-multistream.xml.bz2
1. 获取数据
def readFile(path: String, sc: SparkContext): RDD[String] = {
val conf = new Configuration()
conf.set(XmlInputFormat.START_TAG_KEY, "<page>")
conf.set(XmlInputFormat.END_TAG_KEY, "</page>")
val rawXmls = sc.newAPIHadoopFile(path, classOf[XmlInp
