16
16
package eu .stratosphere .pact .common .io ;
17
17
18
18
import java .io .IOException ;
19
+ import java .io .UnsupportedEncodingException ;
19
20
import java .net .URI ;
20
21
import java .util .ArrayList ;
21
22
import java .util .List ;
30
31
import eu .stratosphere .nephele .fs .FileSystem ;
31
32
import eu .stratosphere .nephele .fs .LineReader ;
32
33
import eu .stratosphere .nephele .fs .Path ;
34
+ import eu .stratosphere .pact .common .contract .FileDataSource ;
33
35
import eu .stratosphere .pact .common .io .statistics .BaseStatistics ;
34
36
import eu .stratosphere .pact .common .type .PactRecord ;
35
37
43
45
*/
44
46
public abstract class DelimitedInputFormat extends FileInputFormat
45
47
{
46
- /**
47
- * The configuration key to set the record delimiter.
48
- */
49
- public static final String RECORD_DELIMITER = "delimited-format.delimiter" ;
50
-
51
- /**
52
- * The configuration key to set the number of samples to take for the statistics.
53
- */
54
- public static final String NUM_STATISTICS_SAMPLES = "delimited-format.numSamples" ;
48
+ // -------------------------------------- Constants -------------------------------------------
55
49
56
50
/**
57
51
* The log.
@@ -68,6 +62,23 @@ public abstract class DelimitedInputFormat extends FileInputFormat
68
62
*/
69
63
private static final int DEFAULT_NUM_SAMPLES = 10 ;
70
64
65
+ // ------------------------------------- Config Keys ------------------------------------------
66
+
67
+ /**
68
+ * The configuration key to set the record delimiter.
69
+ */
70
+ public static final String RECORD_DELIMITER = "delimited-format.delimiter" ;
71
+
72
+ /**
73
+ * The configuration key to set the record delimiter encoding.
74
+ */
75
+ private static final String RECORD_DELIMITER_ENCODING = "delimited-format.delimiter-encoding" ;
76
+
77
+ /**
78
+ * The configuration key to set the number of samples to take for the statistics.
79
+ */
80
+ private static final String NUM_STATISTICS_SAMPLES = "delimited-format.numSamples" ;
81
+
71
82
// --------------------------------------------------------------------------------------------
72
83
73
84
protected byte [] readBuffer ;
@@ -78,7 +89,7 @@ public abstract class DelimitedInputFormat extends FileInputFormat
78
89
79
90
protected int limit ;
80
91
81
- protected byte [] delimiter = new byte [] { '\n' };
92
+ protected byte [] delimiter = new byte [] {'\n' };
82
93
83
94
private byte [] currBuffer ;
84
95
private int currOffset ;
@@ -151,12 +162,18 @@ public void configure(Configuration parameters)
151
162
{
152
163
super .configure (parameters );
153
164
154
- String delimString = parameters .getString (RECORD_DELIMITER , " \n " );
165
+ final String delimString = parameters .getString (RECORD_DELIMITER , AbstractConfigBuilder . NEWLINE_DELIMITER );
155
166
if (delimString == null ) {
156
167
throw new IllegalArgumentException ("The delimiter not be null." );
157
168
}
169
+ final String charsetName = parameters .getString (RECORD_DELIMITER_ENCODING , null );
158
170
159
- this .delimiter = delimString .getBytes ();
171
+ try {
172
+ this .delimiter = charsetName == null ? delimString .getBytes () : delimString .getBytes (charsetName );
173
+ } catch (UnsupportedEncodingException useex ) {
174
+ throw new IllegalArgumentException ("The charset with the name '" + charsetName +
175
+ "' is not supported on this TaskManager instance." , useex );
176
+ }
160
177
161
178
// set the number of samples
162
179
this .numLineSamples = DEFAULT_NUM_SAMPLES ;
@@ -374,6 +391,9 @@ public boolean reachedEnd()
374
391
return this .end ;
375
392
}
376
393
394
+ /* (non-Javadoc)
395
+ * @see eu.stratosphere.pact.common.generic.io.InputFormat#nextRecord(java.lang.Object)
396
+ */
377
397
@ Override
378
398
public boolean nextRecord (PactRecord record ) throws IOException
379
399
{
@@ -501,4 +521,117 @@ private final boolean fillBuffer() throws IOException {
501
521
return true ;
502
522
}
503
523
}
524
+
525
+ // ============================================================================================
526
+
527
+ /**
528
+ * Creates a configuration builder that can be used to set the input format's parameters on the configuration
529
+ * of the given data source in a fluent fashion.
530
+ *
+ * @param target The data source whose parameters (obtained via <code>getParameters()</code>) are handed to the builder.
531
+ * @return A config builder for setting parameters.
532
+ */
533
+ public static ConfigBuilder configureDelimitedFormat (FileDataSource target ) {
534
+ return new ConfigBuilder (target .getParameters ());
535
+ }
536
+
537
+ /**
538
+ * Abstract builder used to set parameters to the input format's configuration in a fluent way.
+ * Every setter writes one or more keys into the wrapped configuration and then returns this builder,
+ * cast to <code>T</code>, so that calls can be chained.
+ *
+ * @param <T> The type returned by the setter methods; each setter returns <code>this</code> cast to this type.
539
+ */
540
+ protected static class AbstractConfigBuilder <T > extends FileInputFormat .AbstractConfigBuilder <T >
541
+ {
542
+ private static final String NEWLINE_DELIMITER = "\n " ;
543
+
544
+ // --------------------------------------------------------------------
545
+
546
+ /**
547
+ * Creates a new builder for the given configuration.
548
+ *
549
+ * @param config The configuration into which the parameters will be written.
550
+ */
551
+ protected AbstractConfigBuilder (Configuration config ) {
552
+ super (config );
553
+ }
554
+
555
+ // --------------------------------------------------------------------
556
+
557
+ /**
558
+ * Sets the delimiter to be a single character, namely the given one. The character must be within
559
+ * the value range <code>0</code> to <code>127</code>. Note that this method does not check the range
+ * itself; staying within it is the caller's responsibility.
+ * <p>
+ * A newline character is stored using the shared newline delimiter constant, any other character is
+ * stored as its one-character string representation.
560
+ *
561
+ * @param delimiter The delimiter character.
562
+ * @return The builder itself.
563
+ */
564
+ public T recordDelimiter (char delimiter ) {
565
+ if (delimiter == '\n' ) {
566
+ this .config .setString (RECORD_DELIMITER , NEWLINE_DELIMITER );
567
+ } else {
568
+ this .config .setString (RECORD_DELIMITER , String .valueOf (delimiter ));
569
+ }
570
+ @ SuppressWarnings ("unchecked" )
571
+ T ret = (T ) this ;
572
+ return ret ;
573
+ }
574
+
575
+ /**
576
+ * Sets the delimiter to be the given string. The string will be converted to bytes for more efficient
577
+ * comparison during input parsing. The conversion will be done using the platform's default charset.
+ * This method only writes the delimiter key; it does not touch the delimiter encoding key.
578
+ *
579
+ * @param delimiter The delimiter string.
580
+ * @return The builder itself.
581
+ */
582
+ public T recordDelimiter (String delimiter ) {
583
+ this .config .setString (RECORD_DELIMITER , delimiter );
584
+ @ SuppressWarnings ("unchecked" )
585
+ T ret = (T ) this ;
586
+ return ret ;
587
+ }
588
+
589
+ /**
590
+ * Sets the delimiter to be the given string. The string will be converted to bytes for more efficient
591
+ * comparison during input parsing. The conversion will be done using the charset with the given name.
592
+ * The charset must be available on the processing nodes, otherwise an exception will be raised at
593
+ * runtime. Both the delimiter and the charset name are written to the configuration as strings.
594
+ *
595
+ * @param delimiter The delimiter string.
596
+ * @param charsetName The name of the encoding character set.
597
+ * @return The builder itself.
598
+ */
599
+ public T recordDelimiter (String delimiter , String charsetName ) {
600
+ this .config .setString (RECORD_DELIMITER , delimiter );
601
+ this .config .setString (RECORD_DELIMITER_ENCODING , charsetName );
602
+ @ SuppressWarnings ("unchecked" )
603
+ T ret = (T ) this ;
604
+ return ret ;
605
+ }
606
+
607
+ /**
608
+ * Sets the number of line samples to take in order to estimate the base statistics for the
609
+ * input format. The value is written to the configuration as given, without validation.
610
+ *
611
+ * @param numSamples The number of line samples to take.
612
+ * @return The builder itself.
613
+ */
614
+ public T numSamplesForStatistics (int numSamples ) {
615
+ this .config .setInteger (NUM_STATISTICS_SAMPLES , numSamples );
616
+ @ SuppressWarnings ("unchecked" )
617
+ T ret = (T ) this ;
618
+ return ret ;
619
+ }
620
+ }
621
+
622
+ /**
623
+ * A builder used to set parameters to the input format's configuration in a fluent way.
+ * It adds no parameters of its own; it binds the type parameter of the abstract builder to
+ * itself, so the inherited setter methods are declared to return a <code>ConfigBuilder</code>.
624
+ */
625
+ public static class ConfigBuilder extends AbstractConfigBuilder <ConfigBuilder >
626
+ {
627
+ /**
628
+ * Creates a new builder for the given configuration. The configuration is passed on unchanged
+ * to the abstract builder's constructor.
629
+ *
630
+ * @param targetConfig The configuration into which the parameters will be written.
631
+ */
632
+ protected ConfigBuilder (Configuration targetConfig ) {
633
+ super (targetConfig );
634
+ }
635
+
636
+ }
504
637
}
0 commit comments