Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 7a0311d

Browse files
author
Paul Yang
committed Apr 15, 2012
HIVE-2942. substr on string containing UTF-8 characters produces StringIndexOutOfBoundsException (Kevin Wilfong via pauly)
git-svn-id: https://svn.apache.org/repos/asf/hive/trunk@1326444 13f79535-47bb-0310-9956-ffa450edef68
1 parent 9cc4066 commit 7a0311d

File tree

4 files changed

+38
-9
lines changed

4 files changed

+38
-9
lines changed
 

‎ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -18,14 +18,14 @@
1818

1919
package org.apache.hadoop.hive.ql.udf;
2020

21+
import java.util.Arrays;
22+
2123
import org.apache.hadoop.hive.ql.exec.Description;
2224
import org.apache.hadoop.hive.ql.exec.UDF;
2325
import org.apache.hadoop.io.BytesWritable;
2426
import org.apache.hadoop.io.IntWritable;
2527
import org.apache.hadoop.io.Text;
2628

27-
import java.util.Arrays;
28-
2929
/**
3030
* UDFSubstr.
3131
*
@@ -65,12 +65,12 @@ public Text evaluate(Text t, IntWritable pos, IntWritable len) {
6565
return r;
6666
}
6767

68-
int[] index = makeIndex(pos.get(), len.get(), t.getLength());
68+
String s = t.toString();
69+
int[] index = makeIndex(pos.get(), len.get(), s.length());
6970
if (index == null) {
7071
return r;
7172
}
7273

73-
String s = t.toString();
7474
r.set(s.substring(index[0], index[1]));
7575
return r;
7676
}

‎ql/src/test/org/apache/hadoop/hive/ql/QTestUtil.java

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -23,13 +23,13 @@
2323
import java.io.BufferedInputStream;
2424
import java.io.BufferedReader;
2525
import java.io.BufferedWriter;
26-
import java.io.DataInputStream;
2726
import java.io.File;
2827
import java.io.FileInputStream;
2928
import java.io.FileNotFoundException;
3029
import java.io.FileOutputStream;
3130
import java.io.FileReader;
3231
import java.io.FileWriter;
32+
import java.io.InputStreamReader;
3333
import java.io.PrintStream;
3434
import java.io.Serializable;
3535
import java.io.UnsupportedEncodingException;
@@ -279,7 +279,7 @@ public void addFile(File qf) throws Exception {
279279

280280
FileInputStream fis = new FileInputStream(qf);
281281
BufferedInputStream bis = new BufferedInputStream(fis);
282-
DataInputStream dis = new DataInputStream(bis);
282+
BufferedReader br = new BufferedReader(new InputStreamReader(bis, "UTF8"));
283283
StringBuilder qsb = new StringBuilder();
284284

285285
// Look for a hint to not run a test on some Hadoop versions
@@ -289,8 +289,8 @@ public void addFile(File qf) throws Exception {
289289
// Read the entire query
290290
boolean excludeQuery = false;
291291
String hadoopVer = ShimLoader.getMajorVersion();
292-
while (dis.available() != 0) {
293-
String line = dis.readLine();
292+
String line;
293+
while ((line = br.readLine()) != null) {
294294

295295
// While we are reading the lines, detect whether this query wants to be
296296
// excluded from running because the Hadoop version is incorrect
@@ -320,7 +320,7 @@ public void addFile(File qf) throws Exception {
320320
"adding query " + qf.getName() + " to the set of tests to skip");
321321
qSkipSet.add(qf.getName());
322322
}
323-
dis.close();
323+
br.close();
324324
}
325325

326326
/**

‎ql/src/test/queries/clientpositive/udf_substr.q

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -65,3 +65,11 @@ SELECT
6565
FROM (
6666
select CAST(concat(substr(value, 1, 0), 'ABC') as BINARY) as ABC from src LIMIT 1
6767
) X;
68+
69+
-- test UTF-8 substr
70+
SELECT
71+
substr("玩", 1),
72+
substr("abc 玩", 5),
73+
substr("abc 玩玩玩 abc", 5),
74+
substr("abc 玩玩玩 abc", 5, 3)
75+
FROM src LIMIT 1;

‎ql/src/test/results/clientpositive/udf_substr.q.out

Lines changed: 21 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -182,3 +182,24 @@ POSTHOOK: type: QUERY
182182
POSTHOOK: Input: default@src
183183
#### A masked pattern was here ####
184184
NULL NULL null null null A AB ABC ABC A AB ABC ABC B BC BC BC C C C C C C C C B BC BC BC A AB ABC ABC
185+
PREHOOK: query: -- test UTF-8 substr
186+
SELECT
187+
substr("玩", 1),
188+
substr("abc 玩", 5),
189+
substr("abc 玩玩玩 abc", 5),
190+
substr("abc 玩玩玩 abc", 5, 3)
191+
FROM src LIMIT 1
192+
PREHOOK: type: QUERY
193+
PREHOOK: Input: default@src
194+
#### A masked pattern was here ####
195+
POSTHOOK: query: -- test UTF-8 substr
196+
SELECT
197+
substr("玩", 1),
198+
substr("abc 玩", 5),
199+
substr("abc 玩玩玩 abc", 5),
200+
substr("abc 玩玩玩 abc", 5, 3)
201+
FROM src LIMIT 1
202+
POSTHOOK: type: QUERY
203+
POSTHOOK: Input: default@src
204+
#### A masked pattern was here ####
205+
玩 玩 玩玩玩 abc 玩玩玩

0 commit comments

Comments (0)
Please sign in to comment.