{"id":1440,"date":"2022-06-15T23:22:08","date_gmt":"2022-06-15T15:22:08","guid":{"rendered":"http:\/\/www.eait.co\/?p=1440"},"modified":"2022-06-15T23:22:59","modified_gmt":"2022-06-15T15:22:59","slug":"spark-cache-checkpoints-mappartitionswithindex-aggregate","status":"publish","type":"post","link":"https:\/\/notes.coremix.net\/?p=1440","title":{"rendered":"spark cache checkpoints mapPartitionsWithIndex aggregate"},"content":{"rendered":"<p>&nbsp;<\/p>\n<p>&nbsp;<\/p>\n<pre class=\"brush: scala; title: ; notranslate\" title=\"\">\r\nscala&gt; var rdd1 = sc.textFile(&quot;hdfs:\/\/bigdata111:9000\/spark\/test_Cache.txt&quot;)\r\nrdd1: org.apache.spark.rdd.RDD&#x5B;String] = hdfs:\/\/bigdata166:9000\/spark\/test_Cache.txt MapPartitionsRDD&#x5B;1] at textFile at &lt;console&gt;:24\r\n\r\nscala&gt; sc.setCheckpointDir(&quot;hdfs:\/\/bigdata166:9000\/spark\/&quot;)\r\n\r\nscala&gt; sc.setCheckpointDir(&quot;hdfs:\/\/bigdata166:9000\/spark\/checkpoint&quot;)\r\n\r\nscala&gt; rdd1.count\r\nscala&gt; rdd1.cache    \/\/\u7f13\u5b58\r\nscala&gt; rdd1.count\r\nres2: Long = 1846904 \r\n\r\n\/\/\r\nscala&gt; def fun1(index:Int,itea:Iterator&#x5B;Int]):Iterator&#x5B;String] = {\r\n     | itea.toList.map(x =&gt; &quot;&#x5B;partid:&quot; + index +&quot;, value=&quot;+x+&quot;]&quot; ).iterator\r\n     | }\r\nfun1: (index: Int, itea: Iterator&#x5B;Int])Iterator&#x5B;String]\r\n\r\nscala&gt; rdd1.mapPartitions\r\nmapPartitions   mapPartitionsWithIndex\r\n\r\nscala&gt; rdd1.mapPartitionsWithIndex(fun1).collect\r\nres3: Array&#x5B;String] = Array(\r\n    &#x5B;partid:0, value=1], &#x5B;partid:0, value=2], &#x5B;partid:0, value=3], \r\n    &#x5B;partid:1, value=4], &#x5B;partid:1, value=5], &#x5B;partid:1, value=6], \r\n    &#x5B;partid:2, value=7], &#x5B;partid:2, value=8], &#x5B;partid:2, value=9])\r\n\r\n<\/pre>\n<p>&nbsp;<\/p>\n<p class=\"md-focus-p\"><span class=\"md-line md-end-block md-focus\" contenteditable=\"true\"><span 
class=\"md-expand\"><strong>aggregate<\/strong><\/span>\uff1a\u805a\u5408\uff0c<\/span><\/p>\n<p><span class=\"md-line md-end-block\" contenteditable=\"true\">\u5148\u5c40\u90e8\u805a\u5408\uff0c\u7136\u540e\u5168\u5c40\u805a\u5408<\/span><\/p>\n<p class=\"\"><span class=\"md-line md-end-block\" contenteditable=\"true\"><span class=\"\">\u6c42\u6bcf\u4e2a\u5206\u533a\u7684\u6700\u5927\u503c\uff0c\u7136\u540e\u6c42\u548c<\/span><\/span><\/p>\n<p><span class=\"md-line md-end-block\" contenteditable=\"true\">\u5148\u6c42\u6bcf\u4e2a\u5206\u533a\u7684\u6700\u5927\u503c<\/span><\/p>\n<p><span class=\"md-line md-end-block\" contenteditable=\"true\">\u6c42\u548c<\/span><\/p>\n<p><span class=\"md-line md-end-block\" contenteditable=\"true\">zeroValue: U\uff1a\u521d\u59cb\u5316\uff0c\u9700\u8981\u8d4b\u503c\uff1a\u521d\u59cb\u503c\u5728\u5c40\u90e8\u64cd\u4f5c\u8d77\u4f5c\u7528\uff0c\u5168\u5c40\u64cd\u4f5c\u4e5f\u8d77\u4f5c\u7528<\/span><\/p>\n<p><span class=\"md-line md-end-block\" contenteditable=\"true\">\u540e\u9762\u4e24\u4e2a\u51fd\u6570\u53c2\u6570<\/span><\/p>\n<p class=\"\"><span class=\"md-line md-end-block\" contenteditable=\"true\"><span class=\"\">\u7b2c\u4e00\u4e2a\u51fd\u6570\uff1a\u8868\u793a\u5c40\u90e8\u64cd\u4f5c<\/span><\/span><\/p>\n<p class=\"\"><span class=\"md-line md-end-block\" contenteditable=\"true\">\u7b2c\u4e8c\u4e2a\uff1a\u8868\u793a\u5168\u5c40\u64cd\u4f5c<\/span><\/p>\n<pre class=\"brush: scala; title: ; notranslate\" title=\"\">\r\nscala&gt; var rdd1 = sc.parallelize(List(1,2,3,4,5),2)\r\nrdd1: org.apache.spark.rdd.RDD&#x5B;Int] = ParallelCollectionRDD&#x5B;4] at parallelize at &lt;console&gt;:24\r\n\r\nscala&gt; rdd1.mapPartitionsWithIndex(fun1).collect\r\nres4: Array&#x5B;String] = Array(\r\n    &#x5B;partid:0, value=1], &#x5B;partid:0, value=2],\r\n    &#x5B;partid:1, value=3], &#x5B;partid:1, value=4], &#x5B;partid:1, value=5])\r\n\r\nscala&gt; rdd1.aggregate(0)(max(_,_),_+_)\r\nres5: Int = 7\r\nscala&gt; 
rdd1.aggregate(10)(max(_,_),_+_)\r\nres6: Int = 30\r\n\/\/\u7b2c\u4e00\u4e2a\u5206\u533a\u6570\u636e\uff1a10\uff08\u521d\u59cb\u503c\uff09\uff0c1\uff0c2\u2014\u2014\u2014\u201410\r\n\/\/\u7b2c\u4e8c\u4e2a\u5206\u533a\uff1a10\uff08\u521d\u59cb\u503c\uff09\uff0c3\uff0c4\uff0c5\u2014\u2014\u2014\u201410\r\n\/\/\u6c42\u548c\uff1a10\uff08\u521d\u59cb\u503c\uff09+10+10=30\r\n\r\n\/\/\u7b2c\u4e00\u4e2a\u5206\u533a\u7684\u6700\u5927\u503c\uff1a2\r\n\/\/\u4e8c\uff1a5\r\n\/\/\u6c42\u548c\uff1a2+5=7\r\n\r\nscala&gt; rdd1.aggregate(0)(_+_,_+_)\r\nres7: Int = 15\r\n\r\nscala&gt; rdd1.aggregate(10)(_+_,_+_)\r\nres8: Int = 45\r\n\r\n\/\/\u7b2c\u4e00\u4e2a\u5206\u533a\uff1a10+1+2=13\r\n\/\/\u7b2c\u4e8c\u4e2a\u5206\u533a\uff1a10+3+4+5=22\r\n\/\/\u6c42\u548c\uff1a10+13+22=45\r\n\r\n\/\/\u5176\u5b83\u64cd\u4f5c\uff1a&lt;\/pre&gt;\r\nscala&gt; var rdd1 = sc.parallelize(List(&quot;12&quot;,&quot;34&quot;,&quot;567&quot;,&quot;8901&quot;),2)\r\nrdd1: org.apache.spark.rdd.RDD&#x5B;String] = ParallelCollectionRDD&#x5B;0] at parallelize at &lt;console&gt;:24\r\n\r\nscala&gt; def fun1(index:Int,iter:Iterator&#x5B;String]):Iterator&#x5B;String]={\r\n| iter.toList.map(x =&gt; &quot;&#x5B;partID: &quot;+index+&quot;,value:&quot;+x+&quot;]&quot;).iterator}\r\nfun1: (index: Int, iter: Iterator&#x5B;String])Iterator&#x5B;String]\r\n\r\nscala&gt; rdd1.mapPartitionsWithIndex(fun1).collect\r\nres1: Array&#x5B;String] = Array(\r\n&#x5B;partID: 0,value:12], &#x5B;partID: 0,value:34],\r\n&#x5B;partID: 1,value:567], &#x5B;partID: 1,value:8901])\r\n\r\nscala&gt; rdd1.aggregate(&quot;&quot;)((x,y)=&gt;math.max(x.length,y.length).toString,(x,y)=&gt;x+y)\r\n\r\nscala&gt; rdd1.aggregate(&quot;&quot;)((x,y)=&gt;math.max(x.length,y.length).toString,(x,y)=&gt;x+y)\r\nres3: String = 42\r\n\r\nscala&gt; rdd1.aggregate(&quot;&quot;)((x,y)=&gt;math.max(x.length,y.length).toString,(x,y)=&gt;x+y)\r\nres4: String = 
24\r\n\r\n\u5206\u6790\uff1a\r\n\u7b2c\u4e00\u4e2a\u5206\u533a\uff1a\u201c12\u201d\uff0c\u201c34\u201d\r\n\u7b2c\u4e00\u6b21\u6bd4\u8f83\uff1a\u201c\u201d\uff0c\u201c12\u201d=2.toString ==\u300b \u201c2\u201d\r\n\u7b2c\u4e8c\u6b21\u6bd4\u8f83\uff1a\u201c2\u201d\uff0c\u201c34\u201d=2.toString ==\u300b \u201c2\u201d\r\n\u7b2c\u4e8c\u4e2a\u5206\u533a\uff1a\u201c567\u201d\uff0c\u201c8901\u201d\r\n\u7b2c\u4e00\u6b21\u6bd4\u8f83\uff1a\u201c\u201d\uff0c\u201c567\u201d=3.toString ==\u300b\u201c3\u201d\r\n\u7b2c\u4e8c\u6b21\u6bd4\u8f83\uff1a\u201c3\u201d\uff0c\u201c8901\u201d=4.toString ==\u300b \u201c4\u201d\r\n\u201c24\u201d\u6216\u8005\u201c42\u201d\r\n\r\nscala&gt; var rdd1 = sc.parallelize(List(&quot;12&quot;,&quot;23&quot;,&quot;345&quot;,&quot;&quot;),2)\r\nrdd1: org.apache.spark.rdd.RDD&#x5B;String] = ParallelCollectionRDD&#x5B;2] at parallelize at &lt;console&gt;:24\r\n\r\nscala&gt; rdd1.mapPartitionsWithIndex(fun1).collect\r\nres6: Array&#x5B;String] = Array(&#x5B;partID: 0,value:12], &#x5B;partID: 0,value:23], &#x5B;partID: 1,value:345], &#x5B;partID: 1,value:])\r\n\r\nscala&gt; rdd1.aggregate(&quot;&quot;)((x,y)=&gt;math.min(x.length,y.length).toString,(x,y)=&gt;x+y)\r\nres7: String = 10\r\n\r\nscala&gt; rdd1.aggregate(&quot;&quot;)((x,y)=&gt;math.min(x.length,y.length).toString,(x,y)=&gt;x+y)\r\nres9: String = 01\r\n\r\n\u5206\u6790\uff1a\r\n\u7b2c\u4e00\u4e2a\u5206\u533a\uff1a\u201c12\u201d\uff0c\u201c23\u201d\r\n\u7b2c\u4e00\u6b21\u6bd4\u8f83\uff1a\u201c\u201d\uff0c\u201c12\u201d=0.toString ==\u300b \u201c0\u201d\r\n\u7b2c\u4e8c\u6b21\u6bd4\u8f83\uff1a\u201c0\u201d\uff0c\u201c23\u201d=1.toString ==\u300b \u201c1\u201d\r\n\u7b2c\u4e8c\u4e2a\u5206\u533a\uff1a\u201c345\u201d\uff0c\u201c\u201d\r\n\u7b2c\u4e00\u6b21\u6bd4\u8f83\uff1a\u201c\u201d\uff0c\u201c345\u201d=0.toString ==\u300b\u201c0\u201d\r\n\u7b2c\u4e8c\u6b21\u6bd4\u8f83\uff1a\u201c0\u201d\uff0c\u201c\u201d=0.toString ==\u300b \u201c0\u201d\r\n\u201c10\u201d\u6216\u8005\u201c01\u201d\r\n\r\nscala&gt; 
var rdd1 = sc.parallelize(List(&quot;12&quot;,&quot;23&quot;,&quot;&quot;,&quot;345&quot;),2)\r\nrdd1: org.apache.spark.rdd.RDD&#x5B;String] = ParallelCollectionRDD&#x5B;5] at parallelize at &lt;console&gt;:24\r\n\r\nscala&gt; rdd1.mapPartitionsWithIndex(fun1).collect\r\nres10: Array&#x5B;String] = Array(&#x5B;partID: 0,value:12], &#x5B;partID: 0,value:23], &#x5B;partID: 1,value:], &#x5B;partID: 1,value:345])\r\n\r\nscala&gt; rdd1.aggregate(&quot;&quot;)((x,y)=&gt;math.min(x.length,y.length).toString,(x,y)=&gt;x+y)\r\nres11: String = 11\r\n\r\n\u5206\u6790\uff1a\r\n\u7b2c\u4e00\u4e2a\u5206\u533a\uff1a\u201c12\u201d\uff0c\u201c23\u201d\r\n\u7b2c\u4e00\u6b21\u6bd4\u8f83\uff1a\u201c\u201d\uff0c\u201c12\u201d=0.toString ==\u300b \u201c0\u201d\r\n\u7b2c\u4e8c\u6b21\u6bd4\u8f83\uff1a\u201c0\u201d\uff0c\u201c23\u201d=1.toString ==\u300b \u201c1\u201d\r\n\u7b2c\u4e8c\u4e2a\u5206\u533a\uff1a\u201c\u201d\uff0c\u201c345\u201d\r\n\u7b2c\u4e00\u6b21\u6bd4\u8f83\uff1a\u201c\u201d\uff0c\u201c\u201d=0.toString ==\u300b\u201c0\u201d\r\n\u7b2c\u4e8c\u6b21\u6bd4\u8f83\uff1a\u201c0\u201d\uff0c\u201c345\u201d=1.toString ==\u300b \u201c1\u201d\r\n\u201c11\u201d\r\n<\/pre>\n<p>&nbsp;<\/p>\n","protected":false},"excerpt":{"rendered":"<p>&nbsp; &nbsp; scala&gt; var rdd1 = sc.textFile(&quot;hd 
[&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[30],"tags":[],"class_list":["post-1440","post","type-post","status-publish","format-standard","hentry","category-bigdata"],"blocksy_meta":[],"_links":{"self":[{"href":"https:\/\/notes.coremix.net\/index.php?rest_route=\/wp\/v2\/posts\/1440","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/notes.coremix.net\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/notes.coremix.net\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/notes.coremix.net\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/notes.coremix.net\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=1440"}],"version-history":[{"count":2,"href":"https:\/\/notes.coremix.net\/index.php?rest_route=\/wp\/v2\/posts\/1440\/revisions"}],"predecessor-version":[{"id":1442,"href":"https:\/\/notes.coremix.net\/index.php?rest_route=\/wp\/v2\/posts\/1440\/revisions\/1442"}],"wp:attachment":[{"href":"https:\/\/notes.coremix.net\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=1440"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/notes.coremix.net\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=1440"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/notes.coremix.net\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=1440"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}