Apache Spark java.lang.ArrayIndexOutOfBoundsException: 3


Here is the file:

package org.apache.spark.rdd;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;

import scala.Tuple2;

public class DataPreperation {

    public static void main(String[] args) {
        SparkConf config = new SparkConf().setMaster("local").setAppName("DataPreperation");
        JavaSparkContext sc = new JavaSparkContext(config);

        JavaRDD<String> custRDD = sc.textFile("data/customer.csv");
        JavaRDD<String> transRDD = sc.textFile("data/transection.csv");

        // Identify distinct rows in customer.csv: key each customer row by its first column.
        JavaPairRDD<String, String> custKP = custRDD.mapToPair(new PairFunction<String, String, String>() {
            public Tuple2<String, String> call(String x) throws Exception {
                return new Tuple2<String, String>(x.split(",")[0], x);
            }
        });
        //System.out.println(custKP.count() + " all rows 25");
        //System.out.println(custKP.keys().distinct() + " distinct rows 25");

        // Keep one row per customer key.
        JavaPairRDD<String, String> custKPReduced = custKP.reduceByKey(new Function2<String, String, String>() {
            public String call(String x, String y) throws Exception {
                return y;
            }
        });
        //System.out.println(custKPReduced.count() + " distinct rows 21");
        //System.out.println(custKPReduced.collect());

        // Key each transaction row by its second column.
        JavaPairRDD<String, String> transKP = transRDD.mapToPair(new PairFunction<String, String, String>() {
            public Tuple2<String, String> call(String x) throws Exception {
                return new Tuple2<String, String>(x.split(",")[1], x);
            }
        });

        JavaPairRDD<String, String> transKPDist = transKP.reduceByKey(new Function2<String, String, String>() {
            public String call(String x, String y) throws Exception {
                return y;
            }
        });

        // Join: (customerId, (customerRow, transactionRow))
        JavaPairRDD<String, Tuple2<String, String>> custTransKP = custKPReduced.join(transKPDist);

        //System.out.println(custTransKP.count());
        //System.out.println(custKPReduced.take(10));
        //System.out.println("customer distinct rows key: " + custKPReduced.count());
        //System.out.println("total joined table rows: " + custTransKP.count());
        //System.out.println("distinct joined table rows: " + custTransKP.distinct().count());
        //System.out.println("transaction total rows + distinct rows: " + transKP.count() + " + " + transKP.distinct().count());
        //JavaRDD<String> subKeys = custKPReduced.subtractByKey(custTransKP).keys();
        //System.out.println(subKeys.distinct().count());
        //JavaRDD<String> totalCustKeys = custTransKP.distinct().keys(); // 22797
        //JavaRDD<String> totalKeys = subKeys.union(totalCustKeys);
        //System.out.println(totalKeys.count());
        //totalKeys.coalesce(1).saveAsTextFile("data/total_keys");
        //System.out.println(custTransKP.take(1));
        //JavaRDD<String> transKeys = transKP.distinct().keys();

        JavaRDD<Tuple2<String, String>> transId = custTransKP.values();

        JavaRDD<String> transKey = transId.map(new Function<Tuple2<String, String>, String>() {
            public String call(Tuple2<String, String> x) throws Exception {
                return x._1().split(",")[3]; // if I change [3] to [2] or [1], no exception is thrown
            }
        });

        custTransKP.coalesce(1).saveAsTextFile("data/custtranskp");
        transId.coalesce(1).saveAsTextFile("data/transid");
        transKey.coalesce(1).saveAsTextFile("data/trans_key");

        //System.out.println("count of transKey: " + transKey.count());
        //System.out.println("first 10: " + transKey.take(10));
    }
}

Here is the output:

16/01/06 09:05:05 ERROR Executor: Exception in task 0.0 in stage 8.0 (TID 4)
java.lang.ArrayIndexOutOfBoundsException: 3
    at org.apache.spark.rdd.DataPreperation$5.call(DataPreperation.java:93)
    at org.apache.spark.rdd.DataPreperation$5.call(DataPreperation.java:1)
    at org.apache.spark.api.java.JavaPairRDD$$anonfun$toScalaFunction$1.apply(JavaPairRDD.scala:1027)
    at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
    at scala.collection.Iterator$$anon$13.next(Iterator.scala:372)
    at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$6.apply$mcV$sp(PairRDDFunctions.scala:1109)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$6.apply(PairRDDFunctions.scala:1108)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$6.apply(PairRDDFunctions.scala:1108)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1206)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13.apply(PairRDDFunctions.scala:1116)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13.apply(PairRDDFunctions.scala:1095)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
    at org.apache.spark.scheduler.Task.run(Task.scala:88)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
    at java.lang.Thread.run(Thread.java:745)
16/01/06 09:05:05 WARN TaskSetManager: Lost task 0.0 in stage 8.0 (TID 4, localhost): java.lang.ArrayIndexOutOfBoundsException: 3
16/01/06 09:05:05 ERROR TaskSetManager: Task 0 in stage 8.0 failed 1 times; aborting job
16/01/06 09:05:05 INFO TaskSchedulerImpl: Removed TaskSet 8.0, whose tasks have all completed, from pool
16/01/06 09:05:05 INFO TaskSchedulerImpl: Cancelling stage 8
16/01/06 09:05:05 INFO DAGScheduler: ResultStage 8 (main at <unknown>:0) failed in 8.285 s
16/01/06 09:05:05 INFO DAGScheduler: Job 2 failed: main at <unknown>:0, took 8.317993 s
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 8.0 failed 1 times, most recent failure: Lost task 0.0 in stage 8.0 (TID 4, localhost): java.lang.ArrayIndexOutOfBoundsException: 3
Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1280)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1268)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1267)
    at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1267)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
    at scala.Option.foreach(Option.scala:236)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:697)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1493)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1455)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1444)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
    at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1813)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1826)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1903)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1124)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1065)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1065)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:306)
    at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1065)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply$mcV$sp(PairRDDFunctions.scala:989)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:965)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:965)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:306)
    at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:965)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply$mcV$sp(PairRDDFunctions.scala:897)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:897)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:897)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:306)
    at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:896)
    at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply$mcV$sp(RDD.scala:1426)
    at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1405)
    at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1405)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:306)
    at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1405)
    at org.apache.spark.api.java.JavaRDDLike$class.saveAsTextFile(JavaRDDLike.scala:522)
    at org.apache.spark.api.java.AbstractJavaRDDLike.saveAsTextFile(JavaRDDLike.scala:47)
    at org.apache.spark.rdd.DataPreperation.main(DataPreperation.java:98)
Caused by: java.lang.ArrayIndexOutOfBoundsException: 3
16/01/06 09:05:05 INFO SparkContext: Invoking stop() from shutdown hook
16/01/06 09:05:05 INFO SparkUI: Stopped Spark web UI at http://192.168.100.35:4040
16/01/06 09:05:05 INFO DAGScheduler: Stopping DAGScheduler
16/01/06 09:05:05 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
16/01/06 09:05:05 INFO MemoryStore: MemoryStore cleared
16/01/06 09:05:05 INFO BlockManager: BlockManager stopped
16/01/06 09:05:05 INFO BlockManagerMaster: BlockManagerMaster stopped
16/01/06 09:05:05 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped!
16/01/06 09:05:05 INFO SparkContext: Successfully stopped SparkContext
16/01/06 09:05:05 INFO ShutdownHookManager: Shutdown hook called
16/01/06 09:05:05 INFO ShutdownHookManager: Deleting directory /tmp/spark-b90705cb-50d2-40fc-9518-e0aed907f570

The transId RDD holds the values of the pair RDD custTransKP, which is built from the two files customer.csv and transaction.csv.

Whenever I try to access an element of transaction.csv with return x._1().split(",")[3]; it throws the exception, but not when I change it to x._1().split(",")[2];.
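For reference, a small diagnostic sketch (not part of the original job, added here for illustration) that could be dropped into the same main method to count the joined values whose first element has fewer than four comma-separated fields, i.e. exactly the rows on which split(",")[3] would fail:

    // Hypothetical diagnostic: find joined values whose first element splits into fewer than 4 fields.
    JavaRDD<Tuple2<String, String>> shortRows = transId.filter(new Function<Tuple2<String, String>, Boolean>() {
        public Boolean call(Tuple2<String, String> x) throws Exception {
            return x._1().split(",").length < 4;
        }
    });
    System.out.println("rows with fewer than 4 fields: " + shortRows.count());
    System.out.println(shortRows.take(5));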

It looks like split doesn't work the way you expect. Try using split(",", -1) so that empty elements between separators are kept in the final RDD and every array has the same number of elements.

split(",",-1) means keep empty values @ end. default split(regex, 0) discards empty values

