Skip to content

Commit fd14339

Browse files
authored
AVRO-3666: [JAVA] Separate parsing from Schema class (apache#2513)
This allows using pluggable parser implementations, allowing multiple formats to be parsed with the same code. This includes the use of NameValidator and parsing multiple files with circular references between them.
1 parent 1cea690 commit fd14339

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

42 files changed

+2088
-694
lines changed

.editorconfig

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ root = true
1919
charset = utf-8
2020
end_of_line = lf
2121
insert_final_newline = true
22+
ij_any_block_comment_at_first_column = false
23+
ij_any_line_comment_at_first_column = false
2224

2325
[*.{java,xml,sh}]
2426
indent_style = space

.mvn/extensions.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,6 @@
2020
<extension>
2121
<groupId>org.apache.maven.extensions</groupId>
2222
<artifactId>maven-build-cache-extension</artifactId>
23-
<version>1.0.0</version>
23+
<version>1.0.1</version>
2424
</extension>
2525
</extensions>

doc/content/en/docs/++version++/Getting started (Java)/_index.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ You may also build the required Avro jars from source. Building Avro is beyond t
7777

7878
## Defining a schema
7979

80-
Avro schemas are defined using JSON. Schemas are composed of primitive types (null, boolean, int, long, float, double, bytes, and string) and complex types (record, enum, array, map, union, and fixed). You can learn more about Avro schemas and types from the specification, but for now let's start with a simple schema example, user.avsc:
80+
Avro schemas are defined using JSON or IDL (the latter requires an extra dependency). Schemas are composed of primitive types (null, boolean, int, long, float, double, bytes, and string) and complex types (record, enum, array, map, union, and fixed). You can learn more about Avro schemas and types from the specification, but for now let's start with a simple schema example, user.avsc:
8181

8282
```json
8383
{"namespace": "example.avro",
@@ -209,10 +209,10 @@ Data in Avro is always stored with its corresponding schema, meaning we can alwa
209209
Let's go over the same example as in the previous section, but without using code generation: we'll create some users, serialize them to a data file on disk, and then read back the file and deserialize the user objects.
210210

211211
### Creating users
212-
First, we use a Parser to read our schema definition and create a Schema object.
212+
First, we use a SchemaParser to read our schema definition and create a Schema object.
213213

214214
```java
215-
Schema schema = new Schema.Parser().parse(new File("user.avsc"));
215+
Schema schema = new SchemaParser().parse(new File("user.avsc"));
216216
```
217217

218218
Using this schema, let's create some users.
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* https://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
package org.apache.avro;
19+
20+
import java.io.IOException;
21+
import java.net.URI;
22+
23+
/**
24+
* Schema parser for a specific schema format.
25+
*
26+
* <p>
27+
* The {@link SchemaParser} class uses this interface, supporting text based
28+
* schema sources.
29+
* </p>
30+
*
31+
* <p>
32+
* Implementations are located using a {@link java.util.ServiceLoader} and must
33+
* therefore be threadsafe. See the {@code ServiceLoader} class for details on
34+
* loading your implementation.
35+
* </p>
36+
*
37+
* @see java.util.ServiceLoader
38+
*/
39+
public interface FormattedSchemaParser {
40+
/**
41+
* <p>
42+
* Parse schema definitions from a text based source.
43+
* </p>
44+
*
45+
* <h2>Notes for implementers:</h2>
46+
*
47+
* <ul>
48+
* <li>Schema definitions are expected not to be in the format the parser
49+
* expects. So when the input clearly doesn't make sense (e.g., reading "/**"
50+
* when expecting JSON), it is a good idea not to do anything (especially
51+
* calling methods on the @code ParseContext}).</li>
52+
* <li>The parameter {@code parseContext} is not thread-safe.</li>
53+
* <li>When parsing, all parsed schema definitions should be added to the
54+
* provided {@link ParseContext}.</li>
55+
* <li>Optionally, you may return a "main" schema. Some schema definitions have
56+
* one, for example the schema defined by the root of the JSON document in a
57+
* <a href="https://avro.apache.org/docs/current/specification/">standard schema
58+
* definition</a>. If unsure, return {@code null}.</li>
59+
* <li>If parsing fails, throw a {@link SchemaParseException}. This will let the
60+
* parsing process recover and continue.</li>
61+
* <li>Throwing anything other than a {@code SchemaParseException} will abort
62+
* the parsing process, so reserve that for rethrowing exceptions.</li>
63+
* </ul>
64+
*
65+
* @param parseContext the current parse context: all parsed schemata should
66+
* be added here to resolve names with; contains all
67+
* previously known types
68+
* @param baseUri the base location of the schema, or {@code null} if
69+
* not known
70+
* @param formattedSchema the text of the schema definition(s) to parse
71+
* @return the main schema, if any
72+
* @throws IOException when the schema cannot be read
73+
* @throws SchemaParseException when the schema cannot be parsed
74+
*/
75+
Schema parse(ParseContext parseContext, URI baseUri, CharSequence formattedSchema)
76+
throws IOException, SchemaParseException;
77+
}
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* https://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
package org.apache.avro;
19+
20+
import java.io.IOException;
21+
import java.net.URI;
22+
23+
/**
24+
* Schema parser for JSON formatted schemata. This initial implementation simply
25+
* delegates to the {@link Schema.Parser} class, though it should be refactored
26+
* out of there.
27+
*
28+
* <p>
29+
* Note: this class is intentionally not available via the Java
30+
* {@link java.util.ServiceLoader}, as its use is hardcoded as fallback when no
31+
* service exists. This enables users to reliably override the standard JSON
32+
* parser as well.
33+
* </p>
34+
*/
35+
public class JsonSchemaParser implements FormattedSchemaParser {
36+
/**
37+
* <p>
38+
* Parse a schema written in the internal (JSON) format without any validations.
39+
* </p>
40+
*
41+
* <p>
42+
* Using this method is only safe if used to parse a write schema (i.e., a
43+
* schema used to read Avro data). Other usages, for example by generated Avro
44+
* code, can cause interoperability problems.
45+
* </p>
46+
*
47+
* <p>
48+
* Use with care and sufficient testing!
49+
* </p>
50+
*
51+
* @param fragments one or more strings making up the schema (some schemata
52+
* exceed the compiler limits)
53+
* @return the parsed schema
54+
*/
55+
public static Schema parseInternal(String... fragments) {
56+
StringBuilder buffer = new StringBuilder();
57+
for (String fragment : fragments) {
58+
buffer.append(fragment);
59+
}
60+
return new JsonSchemaParser().parse(new ParseContext(NameValidator.NO_VALIDATION), buffer, null);
61+
}
62+
63+
@Override
64+
public Schema parse(ParseContext parseContext, URI baseUri, CharSequence formattedSchema)
65+
throws IOException, SchemaParseException {
66+
return parse(parseContext, formattedSchema, parseContext.nameValidator);
67+
}
68+
69+
private Schema parse(ParseContext parseContext, CharSequence formattedSchema, NameValidator nameValidator)
70+
throws SchemaParseException {
71+
Schema.Parser parser = new Schema.Parser(nameValidator);
72+
if (nameValidator == NameValidator.NO_VALIDATION) {
73+
parser.setValidateDefaults(false);
74+
} else {
75+
parser = new Schema.Parser(nameValidator);
76+
}
77+
parser.addTypes(parseContext.typesByName().values());
78+
Schema schema = parser.parse(formattedSchema.toString());
79+
parser.getTypes().values().forEach(parseContext::put);
80+
return schema;
81+
}
82+
}
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* https://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
package org.apache.avro;
19+
20+
/**
 * Strategy for validating names. The default implementation of
 * {@link #validate(String)} accepts every name.
 */
public interface NameValidator {

  /**
   * Outcome of a name validation: either the shared {@link NameValidator#OK}
   * instance, or an instance carrying an error message.
   */
  class Result {
    private final String errors;

    /**
     * @param errors description of what is wrong with the name
     */
    public Result(final String errors) {
      this.errors = errors;
    }

    /**
     * @return {@code true} only for the shared {@link NameValidator#OK} instance
     *         (identity comparison)
     */
    public boolean isOK() {
      return this == NameValidator.OK;
    }

    /**
     * @return the error message given at construction ({@code null} for
     *         {@link NameValidator#OK})
     */
    public String getErrors() {
      return errors;
    }
  }

  /** The one and only successful result. */
  Result OK = new Result(null);

  /**
   * Validate a name.
   *
   * @param name the name to validate
   * @return {@link #OK} if the name is valid, otherwise a result describing the
   *         problem; this default implementation always returns {@link #OK}
   */
  default Result validate(String name) {
    return OK;
  }

  /** Validator that accepts any name (uses the default implementation). */
  NameValidator NO_VALIDATION = new NameValidator() {
  };

  /**
   * Validator that accepts names starting with a Unicode letter or underscore,
   * followed by Unicode letters, digits and underscores. The name is inspected
   * per code point, so letters and digits outside the Basic Multilingual Plane
   * (encoded as surrogate pairs) are recognised as well.
   */
  NameValidator UTF_VALIDATOR = new NameValidator() {
    @Override
    public Result validate(final String name) {
      if (name == null) {
        return new Result("Null name");
      }
      int length = name.length();
      if (length == 0) {
        return new Result("Empty name");
      }
      int first = name.codePointAt(0);
      if (!(Character.isLetter(first) || first == '_')) {
        return new Result("Illegal initial character: " + name);
      }
      // Advance by the width of each code point (1 or 2 chars) so surrogate
      // pairs are judged as the single character they encode.
      for (int i = Character.charCount(first); i < length;) {
        int c = name.codePointAt(i);
        if (!(Character.isLetterOrDigit(c) || c == '_')) {
          return new Result("Illegal character in: " + name);
        }
        i += Character.charCount(c);
      }
      return OK;
    }
  };

  /**
   * Validator that accepts only names matching {@code [A-Za-z_][A-Za-z0-9_]*}.
   */
  NameValidator STRICT_VALIDATOR = new NameValidator() {
    @Override
    public Result validate(final String name) {
      if (name == null) {
        return new Result("Null name");
      }
      int length = name.length();
      if (length == 0) {
        return new Result("Empty name");
      }
      char first = name.charAt(0);
      if (!(isLetter(first) || first == '_')) {
        return new Result("Illegal initial character: " + name);
      }
      for (int i = 1; i < length; i++) {
        char c = name.charAt(i);
        if (!(isLetter(c) || isDigit(c) || c == '_')) {
          return new Result("Illegal character in: " + name);
        }
      }
      return OK;
    }

    /** ASCII letters only. */
    private boolean isLetter(char c) {
      return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
    }

    /** ASCII digits only. */
    private boolean isDigit(char c) {
      return c >= '0' && c <= '9';
    }

  };

}

0 commit comments

Comments
 (0)