Apache Spark java.lang.ArrayIndexOutOfBoundsException: 3
Here is my file:
package org.apache.spark.rdd;

import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;

import scala.Tuple2;

public class DataPreperation {

    public static void main(String[] args) {
        SparkConf config = new SparkConf().setMaster("local").setAppName("DataPreperation");
        JavaSparkContext sc = new JavaSparkContext(config);

        JavaRDD<String> custRdd = sc.textFile("data/customer.csv");
        JavaRDD<String> transRdd = sc.textFile("data/transection.csv");

        // identify distinct rows in customer.csv
        JavaPairRDD<String, String> custKp = custRdd.mapToPair(new PairFunction<String, String, String>() {
            public Tuple2<String, String> call(String x) throws Exception {
                return new Tuple2(x.split(",")[0], x);
            }
        });
        // System.out.println(custKp.count() + " all rows 25");
        // System.out.println(custKp.keys().distinct() + " distinct rows 25");

        JavaPairRDD<String, String> custKpReduced = custKp.reduceByKey(new Function2<String, String, String>() {
            public String call(String x, String y) throws Exception {
                return y;
            }
        });
        // System.out.println(custKpReduced.count() + " distinct rows 21");
        // System.out.println(custKpReduced.collect());

        JavaPairRDD<String, String> transKp = transRdd.mapToPair(new PairFunction<String, String, String>() {
            public Tuple2<String, String> call(String x) throws Exception {
                return new Tuple2(x.split(",")[1], x);
            }
        });

        JavaPairRDD<String, String> transKpDist = transKp.reduceByKey(new Function2<String, String, String>() {
            public String call(String x, String y) throws Exception {
                return y;
            }
        });

        JavaPairRDD<String, Tuple2<String, String>> custTransKp = custKpReduced.join(transKpDist);
        // System.out.println(custTransKp.count());
        // System.out.println(custKpReduced.take(10));
        // System.out.println("customer distinct rows key: " + custKpReduced.count());
        // System.out.println("total joined table rows: " + custTransKp.count());
        // System.out.println("distinct joined table rows: " + custTransKp.distinct().count());
        // System.out.println("transaction total rows + distinct rows: " + transKp.count() + " + " + transKp.distinct().count());
        // JavaRDD<String> subKeys = custKpReduced.subtractByKey(custTransKp).keys();
        // System.out.println(subKeys.distinct().count());
        // JavaRDD<String> totalCustKeys = custTransKp.distinct().keys(); // 22797
        // JavaRDD<String> totalKeys = subKeys.union(totalCustKeys);
        // System.out.println(totalKeys.count());
        // totalKeys.coalesce(1).saveAsTextFile("data/total_keys");
        // System.out.println(custTransKp.take(1));
        // JavaRDD<String> transKeys = transKp.distinct().keys();

        JavaRDD<Tuple2<String, String>> transId = custTransKp.values();

        JavaRDD<String> transKey = transId.map(new Function<Tuple2<String, String>, String>() {
            public String call(Tuple2<String, String> x) throws Exception {
                return x._1().split(",")[3]; // if I change [3] to [2] or [1] the exception is not thrown
            }
        });

        custTransKp.coalesce(1).saveAsTextFile("data/custtranskp");
        transId.coalesce(1).saveAsTextFile("data/transid");
        transKey.coalesce(1).saveAsTextFile("data/trans_key");

        // System.out.println("count of transKey: " + transKey.count());
        // System.out.println("first 10: " + transKey.take(10));
    }
}

Here is the output:
16/01/06 09:05:05 ERROR Executor: Exception in task 0.0 in stage 8.0 (TID 4)
java.lang.ArrayIndexOutOfBoundsException: 3
    at org.apache.spark.rdd.DataPreperation$5.call(DataPreperation.java:93)
    at org.apache.spark.rdd.DataPreperation$5.call(DataPreperation.java:1)
    at org.apache.spark.api.java.JavaPairRDD$$anonfun$toScalaFunction$1.apply(JavaPairRDD.scala:1027)
    at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
    at scala.collection.Iterator$$anon$13.next(Iterator.scala:372)
    at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$6.apply$mcV$sp(PairRDDFunctions.scala:1109)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$6.apply(PairRDDFunctions.scala:1108)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$6.apply(PairRDDFunctions.scala:1108)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1206)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13.apply(PairRDDFunctions.scala:1116)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13.apply(PairRDDFunctions.scala:1095)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
    at org.apache.spark.scheduler.Task.run(Task.scala:88)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
    at java.lang.Thread.run(Thread.java:745)
16/01/06 09:05:05 WARN TaskSetManager: Lost task 0.0 in stage 8.0 (TID 4, localhost): java.lang.ArrayIndexOutOfBoundsException: 3
    (same stack trace as above)
16/01/06 09:05:05 ERROR TaskSetManager: Task 0 in stage 8.0 failed 1 times; aborting job
16/01/06 09:05:05 INFO TaskSchedulerImpl: Removed TaskSet 8.0, whose tasks have all completed, from pool
16/01/06 09:05:05 INFO TaskSchedulerImpl: Cancelling stage 8
16/01/06 09:05:05 INFO DAGScheduler: ResultStage 8 (main at <unknown>:0) failed in 8.285 s
16/01/06 09:05:05 INFO DAGScheduler: Job 2 failed: main at <unknown>:0, took 8.317993 s
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 8.0 failed 1 times, most recent failure: Lost task 0.0 in stage 8.0 (TID 4, localhost): java.lang.ArrayIndexOutOfBoundsException: 3
    (same stack trace as above)
Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1280)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1268)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1267)
    at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1267)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
    at scala.Option.foreach(Option.scala:236)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:697)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1493)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1455)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1444)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
    at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1813)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1826)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1903)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1124)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1065)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1065)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:306)
    at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1065)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply$mcV$sp(PairRDDFunctions.scala:989)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:965)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:965)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:306)
    at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:965)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply$mcV$sp(PairRDDFunctions.scala:897)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:897)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:897)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:306)
    at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:896)
    at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply$mcV$sp(RDD.scala:1426)
    at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1405)
    at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1405)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:306)
    at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1405)
    at org.apache.spark.api.java.JavaRDDLike$class.saveAsTextFile(JavaRDDLike.scala:522)
    at org.apache.spark.api.java.AbstractJavaRDDLike.saveAsTextFile(JavaRDDLike.scala:47)
    at org.apache.spark.rdd.DataPreperation.main(DataPreperation.java:98)
Caused by: java.lang.ArrayIndexOutOfBoundsException: 3
    (same stack trace as above)
16/01/06 09:05:05 INFO SparkContext: Invoking stop() from shutdown hook
16/01/06 09:05:05 INFO SparkUI: Stopped Spark web UI at http://192.168.100.35:4040
16/01/06 09:05:05 INFO DAGScheduler: Stopping DAGScheduler
16/01/06 09:05:05 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
16/01/06 09:05:05 INFO MemoryStore: MemoryStore cleared
16/01/06 09:05:05 INFO BlockManager: BlockManager stopped
16/01/06 09:05:05 INFO BlockManagerMaster: BlockManagerMaster stopped
16/01/06 09:05:05 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped!
16/01/06 09:05:05 INFO SparkContext: Stopped SparkContext
16/01/06 09:05:05 INFO ShutdownHookManager: Shutdown hook called
16/01/06 09:05:05 INFO ShutdownHookManager: Deleting directory /tmp/spark-b90705cb-50d2-40fc-9518-e0aed907f570

The transId RDD holds the values of the pair RDD custTransKp, which is built by joining the two files customer.csv and transaction.csv.
Whenever I try to access the elements of transaction.csv with return x._1().split(",")[3]; the exception above is thrown, but it is not thrown when I change it to x._1().split(",")[2];.
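For reference, a quick diagnostic sketch (not part of the original program) that reuses the transId RDD from the code above and lists the joined values whose first element has fewer than four comma-separated fields, i.e. exactly the values on which split(",")[3] fails:

    JavaRDD<Tuple2<String, String>> shortRows = transId.filter(
            new Function<Tuple2<String, String>, Boolean>() {
                public Boolean call(Tuple2<String, String> x) throws Exception {
                    // same split as the failing line, so this flags exactly the offending rows
                    return x._1().split(",").length < 4;
                }
            });
    System.out.println("values with fewer than 4 fields: " + shortRows.count());
    System.out.println(shortRows.take(5));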
It looks like split does not work the way you expect. Try using split(",", -1): the empty fields produced by the separators are then kept in the final array, so every row yields the same number of elements.
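Applied to the map that throws, the suggestion would look roughly like this (a sketch, not tested against your data; the length check is an extra guard I added for rows that genuinely have fewer than four columns):

    JavaRDD<String> transKey = transId.map(new Function<Tuple2<String, String>, String>() {
        public String call(Tuple2<String, String> x) throws Exception {
            String[] fields = x._1().split(",", -1);   // -1 keeps trailing empty fields
            return fields.length > 3 ? fields[3] : ""; // extra guard for genuinely short rows
        }
    });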
split(",",-1) means keep empty values @ end. default split(regex, 0) discards empty values