{"id":1357,"date":"2022-04-21T23:24:23","date_gmt":"2022-04-21T15:24:23","guid":{"rendered":"http:\/\/www.eait.co\/?p=1357"},"modified":"2022-04-21T23:24:23","modified_gmt":"2022-04-21T15:24:23","slug":"spark%e7%ae%97%e5%ad%90%e5%88%9d%e6%ad%a5","status":"publish","type":"post","link":"https:\/\/notes.coremix.net\/?p=1357","title":{"rendered":"spark\u7b97\u5b50\u521d\u6b65"},"content":{"rendered":"<p>\u79c9\u627f\u5f52\u96f6\u5fc3\u6001\u662f\u5b66\u4e60\u7f16\u7a0b\u5fc5\u5907\u6001\u5ea6\u7684\u539f\u5219\uff0c\u8fd9\u6b21scala\u4e5f\u4f7f\u7528python\u7684\u65b9\u6cd5\uff0c\u628a\u4ee3\u7801\u6572\u4e00\u904d\uff0c\u6240\u4ee5\uff0c\u5c31\u6709\u4e86\u4e0b\u9762\u7684&#8230;.\u540c\u6837\u4fdd\u7559\u4e86\u8bd5\u9519\u7684\u5185\u5bb9\u3002<\/p>\n<p>&nbsp;<\/p>\n<pre class=\"brush: scala; title: ; notranslate\" title=\"\">\r\nscala&gt; var rdd1 = sc.parallelizera(List(12,3,45,5,6,7,7))\r\n&lt;console&gt;:24: error: value parallelizera is not a member of org.apache.spark.SparkContext\r\n       var rdd1 = sc.parallelizera(List(12,3,45,5,6,7,7))\r\n                     ^\r\n\r\nscala&gt; var rdd1 = sc.parallelize(List(12,3,45,5,6,7,7))\r\nrdd1: org.apache.spark.rdd.RDD&#x5B;Int] = ParallelCollectionRDD&#x5B;0] at parallelize at &lt;console&gt;:24\r\n\r\nscala&gt; rdd1.collect\r\ncollect   collectAsync\r\n\r\nscala&gt; rdd1.collect\r\nres0: Array&#x5B;Int] = Array(12, 3, 45, 5, 6, 7, 7)                                 \r\n\r\nscala&gt; var rdd2 = rdd1.map(_*2)\r\nrdd2: org.apache.spark.rdd.RDD&#x5B;Int] = MapPartitionsRDD&#x5B;1] at map at &lt;console&gt;:26\r\n\r\nscala&gt; rdd2.co\r\ncoalesce   collect   collectAsync   compute   context   copy   count   countApprox   countApproxDistinct   countAsync   countByValue   countByValueApprox\r\n\r\nscala&gt; rdd2.collect\r\nres1: Array&#x5B;Int] = Array(24, 6, 90, 10, 12, 14, 14)\r\n\r\nscala&gt; var rdd3 = rdd2.sortBy(x=&gt;x,true)\r\nrdd3: org.apache.spark.rdd.RDD&#x5B;Int] = MapPartitionsRDD&#x5B;6] at sortBy at &lt;console&gt;:28\r\n\r\nscala&gt; rdd3.collect\r\nres2: Array&#x5B;Int] = Array(6, 10, 12, 14, 14, 24, 90)                             \r\n\r\nscala&gt; var rdd3 = rdd2.sortBy(x=&gt;x,false)\r\nrdd3: org.apache.spark.rdd.RDD&#x5B;Int] = MapPartitionsRDD&#x5B;11] at sortBy at &lt;console&gt;:28\r\n\r\nscala&gt; rdd3.collect\r\nres3: Array&#x5B;Int] = Array(90, 24, 14, 14, 12, 10, 6)\r\n\r\nscala&gt; var rdd3 = rdd2.sortBy(x,x,false)\r\n&lt;console&gt;:28: error: not found: value x\r\n       var rdd3 = rdd2.sortBy(x,x,false)\r\n                              ^\r\n&lt;console&gt;:28: error: not found: value x\r\n       var rdd3 = rdd2.sortBy(x,x,false)\r\n                                ^\r\n&lt;console&gt;:28: error: type mismatch;\r\n found   : Boolean(false)\r\n required: Int\r\n       var rdd3 = rdd2.sortBy(x,x,false)\r\n                                  ^\r\n\r\nscala&gt; var rdd3 = rdd2.sortBy((_,)_,false)\r\n&lt;console&gt;:1: error: illegal start of simple expression\r\nvar rdd3 = rdd2.sortBy((_,)_,false)\r\n                          ^\r\n\r\nscala&gt; var rdd3 = rdd2.sortBy((_,_),false)\r\n&lt;console&gt;:28: error: missing parameter type for expanded function ((x$1, x$2) =&gt; scala.Tuple2(x$1, x$2))\r\nError occurred in an application involving default arguments.\r\n       var rdd3 = rdd2.sortBy((_,_),false)\r\n                               ^\r\n&lt;console&gt;:28: error: missing parameter type for expanded function ((x$1: &lt;error&gt;, x$2) =&gt; scala.Tuple2(x$1, x$2))\r\nError occurred in an application involving default 
The next several attempts all try to pass the key function to sortBy with underscore shorthand, and all fail:

```scala
scala> var rdd3 = rdd2.sortBy(x,x,false)
<console>:28: error: not found: value x
       var rdd3 = rdd2.sortBy(x,x,false)
                              ^
<console>:28: error: not found: value x
       var rdd3 = rdd2.sortBy(x,x,false)
                                ^
<console>:28: error: type mismatch;
 found   : Boolean(false)
 required: Int
       var rdd3 = rdd2.sortBy(x,x,false)
                                  ^

scala> var rdd3 = rdd2.sortBy((_,)_,false)
<console>:1: error: illegal start of simple expression
var rdd3 = rdd2.sortBy((_,)_,false)
                          ^

scala> var rdd3 = rdd2.sortBy((_,_),false)
<console>:28: error: missing parameter type for expanded function ((x$1, x$2) => scala.Tuple2(x$1, x$2))
Error occurred in an application involving default arguments.
       var rdd3 = rdd2.sortBy((_,_),false)
                               ^
<console>:28: error: missing parameter type for expanded function ((x$1: <error>, x$2) => scala.Tuple2(x$1, x$2))
Error occurred in an application involving default arguments.
       var rdd3 = rdd2.sortBy((_,_),false)
                                 ^

scala> var rdd3 = rdd2.sortBy((__),false)
<console>:28: error: not found: value __
Error occurred in an application involving default arguments.
       var rdd3 = rdd2.sortBy((__),false)
                               ^

scala> var rdd3 = rdd2.sortBy((x)=>x,false)
rdd3: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[16] at sortBy at <console>:28

scala> rdd3.collect
res4: Array[Int] = Array(90, 24, 14, 14, 12, 10, 6)
```

The compiler messages explain the failures: `(_,_)` expands to a two-parameter function that builds a tuple, while sortBy expects a single-parameter key function, and next to sortBy's default arguments the parameter types cannot be inferred. Writing the lambda explicitly, `(x) => x`, works.

I then restarted spark-shell and tried sortBy with a derived key, followed by filter and distinct:

```scala
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 2.1.0
      /_/

Using Scala version 2.11.8 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_144)
Type in expressions to have them evaluated.
Type :help for more information.

scala> var rdd1 = sc.parallelize(List(12,3,45,5,6,7,7))
rdd1: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[0] at parallelize at <console>:24

scala> var rdd3 = rdd1.sortBy(x=>x+3,false)
rdd3: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[5] at sortBy at <console>:26

scala> rdd3.collect
res0: Array[Int] = Array(45, 12, 7, 7, 6, 5, 3)

scala> var rdd4 = rdd3.filter(_>6)
rdd4: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[6] at filter at <console>:28

scala> rdd4.collect
res1: Array[Int] = Array(45, 12, 7, 7)

scala> var rdd5 = rdd3.distinct
rdd5: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[9] at distinct at <console>:28

scala> rdd5.collect
res2: Array[Int] = Array(45, 6, 3, 12, 7, 5)
```
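To summarize the working forms of sortBy from the two sessions, here is a small sketch; it assumes a live `sc` as in spark-shell. The second parameter is `ascending` and defaults to true.

```scala
// assumes a live SparkContext `sc`, as in spark-shell
val rdd = sc.parallelize(List(12, 3, 45, 5, 6, 7, 7)).map(_ * 2)

// sortBy takes a one-argument key function f: T => K, then an ascending flag
val asc     = rdd.sortBy(x => x, true)       // ascending
val desc    = rdd.sortBy(x => x, false)      // descending
val derived = rdd.sortBy(x => x + 3, false)  // adding a constant is monotonic,
                                             // so the order matches `desc`
```

This is why `sortBy(x=>x+3,false)` in the transcript returned the same descending order as sorting on x itself: the derived key only matters if it reorders elements.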
Next, pair RDDs (key-value pairs). The first attempt hits Scala's ban on leading zeros in integer literals:

```scala
scala> var rdd1 = sc.parallelize(List(("Tom",1000),("Mary",999),("TTT",0900)))
<console>:1: error: Decimal integer literals may not have a leading zero. (Octal syntax is obsolete.)
var rdd1 = sc.parallelize(List(("Tom",1000),("Mary",999),("TTT",0900)))
                                                                ^

scala> var rdd1 = sc.parallelize(List(("Tom",1000),("Mary",999),("TTT",900)))
rdd1: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[10] at parallelize at <console>:24

scala> var rdd2 = sc.parallelize(List(("Tom",1000),("Mary",999),("666",900)))
rdd2: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[11] at parallelize at <console>:24

scala> var rdd2 = sc.parallelize(List(("Tom",1050),("Mary",888),("666",900)))
rdd2: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[12] at parallelize at <console>:24

scala> rdd3 = rdd1.union(rdd2)
<console>:30: error: type mismatch;
 found   : org.apache.spark.rdd.RDD[(String, Int)]
 required: org.apache.spark.rdd.RDD[Int]
       rdd3 = rdd1.union(rdd2)
                         ^

scala> var rdd3 = rdd1.union(rdd2)
rdd3: org.apache.spark.rdd.RDD[(String, Int)] = UnionRDD[13] at union at <console>:28

scala> rdd3.collect
res3: Array[(String, Int)] = Array((Tom,1000), (Mary,999), (TTT,900), (Tom,1050), (Mary,888), (666,900))

scala> var rdd4 = rdd3.groupByKey()
rdd4: org.apache.spark.rdd.RDD[(String, Iterable[Int])] = ShuffledRDD[14] at groupByKey at <console>:30

scala> rdd4.collect
res4: Array[(String, Iterable[Int])] = Array((TTT,CompactBuffer(900)), (666,CompactBuffer(900)), (Mary,CompactBuffer(999, 888)), (Tom,CompactBuffer(1000, 1050)))

scala> var rdd5 = rdd3.reduceByKey(_+_)
rdd5: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[15] at reduceByKey at <console>:30

scala> rdd5.collect
res5: Array[(String, Int)] = Array((TTT,900), (666,900), (Mary,1887), (Tom,2050))
```

(The type-mismatch on `rdd3 = rdd1.union(rdd2)` happened because rdd3 was still bound to the earlier RDD[Int]; re-declaring it with `var` fixes it.)

At this point I typed a note to myself straight into the REPL — "reduceByKey calls groupByKey under the hood" (reducebykey底层调用的groupbykey) — which of course just produced a `not found: value` error.
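One correction to that note, hedged as my understanding of the Spark source: reduceByKey is not implemented on top of groupByKey. Both are built on combineByKeyWithClassTag, and reduceByKey additionally merges values within each partition before the shuffle, so it typically moves far less data. A sketch of the behavioral equivalence, reusing the pairs from the session (assumes a live `sc`):

```scala
val pairs = sc.parallelize(List(("Tom", 1000), ("Mary", 999), ("TTT", 900),
                                ("Tom", 1050), ("Mary", 888), ("666", 900)))

// Preferred: partial sums are computed per partition before shuffling.
val viaReduce = pairs.reduceByKey(_ + _)

// Same result, but every individual value crosses the network first.
val viaGroup  = pairs.groupByKey().mapValues(_.sum)

assert(viaReduce.collect().toMap == viaGroup.collect().toMap)
```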
After another shell restart: cogroup, sortByKey, union, and sorting pairs by value:

```scala
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 2.1.0
      /_/

Using Scala version 2.11.8 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_144)
Type in expressions to have them evaluated.
Type :help for more information.

scala> var rdd1 = sc.parallelize(List(("Tom",1000),("Mary",999),("TTT",900)))
rdd1: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[0] at parallelize at <console>:24

scala> var rdd2 = sc.parallelize(List(("Tom",1000),("Mary",999),("666",900)))
rdd2: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[1] at parallelize at <console>:24

scala> var rdd3 = rdd1.cogroup(rdd2)
rdd3: org.apache.spark.rdd.RDD[(String, (Iterable[Int], Iterable[Int]))] = MapPartitionsRDD[3] at cogroup at <console>:28

scala> rdd3.collect
res0: Array[(String, (Iterable[Int], Iterable[Int]))] = Array((TTT,(CompactBuffer(900),CompactBuffer())), (666,(CompactBuffer(),CompactBuffer(900))), (Tom,(CompactBuffer(1000),CompactBuffer(1000))), (Mary,(CompactBuffer(999),CompactBuffer(999))))

scala> var rdd2 = sc.parallelize(List(("Tom",1000),("Mary",999),("Tom",888),("666",900)))
rdd2: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[4] at parallelize at <console>:24

scala> var rdd3 = rdd1.cogroup(rdd2)
rdd3: org.apache.spark.rdd.RDD[(String, (Iterable[Int], Iterable[Int]))] = MapPartitionsRDD[6] at cogroup at <console>:28

scala> rdd3.collect
res1: Array[(String, (Iterable[Int], Iterable[Int]))] = Array((TTT,(CompactBuffer(900),CompactBuffer())), (666,(CompactBuffer(),CompactBuffer(900))), (Tom,(CompactBuffer(1000),CompactBuffer(888, 1000))), (Mary,(CompactBuffer(999),CompactBuffer(999))))

scala> var rdd4 = rdd1.sortBykey(true)
<console>:26: error: value sortBykey is not a member of org.apache.spark.rdd.RDD[(String, Int)]
       var rdd4 = rdd1.sortBykey(true)
                       ^

scala> var rdd4 = rdd1.sortByKey(true)
rdd4: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[9] at sortByKey at <console>:26

scala> rdd4.collect
res2: Array[(String, Int)] = Array((Mary,999), (TTT,900), (Tom,1000))

scala> rdd5 = rdd1.union(rdd2)
<console>:30: error: not found: value rdd5
val $ires6 = rdd5
             ^
<console>:28: error: not found: value rdd5
       rdd5 = rdd1.union(rdd2)
       ^

scala> var rdd5 = rdd1.union(rdd2)
rdd5: org.apache.spark.rdd.RDD[(String, Int)] = UnionRDD[10] at union at <console>:28

scala> rdd5.collect
res3: Array[(String, Int)] = Array((Tom,1000), (Mary,999), (TTT,900), (Tom,1000), (Mary,999), (Tom,888), (666,900))

scala> var rdd6 = rdd5.map(x=>(x._2,x._1)).sortByKey(true).map(x=>(x._2,x._1))
rdd6: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[15] at map at <console>:30

scala> rdd6.collect
res4: Array[(String, Int)] = Array((Tom,888), (666,900), (TTT,900), (Mary,999), (Mary,999), (Tom,1000), (Tom,1000))

scala> var rdd7 = rdd5.sortBy(x=>x._2,true)
rdd7: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[20] at sortBy at <console>:30

scala> rdd7.collect
res5: Array[(String, Int)] = Array((Tom,888), (TTT,900), (666,900), (Mary,999), (Mary,999), (Tom,1000), (Tom,1000))
```
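rdd6 and rdd7 above are two spellings of sort-by-value: the double map turns each (key, value) into (value, key) so that sortByKey applies, then swaps back, while sortBy takes the key-extraction function directly. Tuple2's built-in `swap` shortens the first form. A sketch (assumes a live `sc`; rdd5 is recreated from the session above):

```scala
// assumes a live SparkContext `sc`; rdd5 recreated from the session above
val rdd5 = sc.parallelize(List(("Tom",1000), ("Mary",999), ("TTT",900),
                               ("Tom",1000), ("Mary",999), ("Tom",888), ("666",900)))

// 1) swap to (value, key), sort on the new key, swap back
val byValue1 = rdd5.map(_.swap).sortByKey(true).map(_.swap)

// 2) sortBy extracts the sort key directly from each pair
val byValue2 = rdd5.sortBy(x => x._2, true)
```

As res4 and res5 show, ties such as (TTT,900) and (666,900) can come back in either order, since only the value participates in the sort.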
More to come tomorrow.