public class HoodieAvroUtils extends Object
| Modifier and Type | Field and Description |
|---|---|
static String |
AVRO_VERSION |
static org.apache.avro.Conversions.DecimalConversion |
DECIMAL_CONVERSION |
static org.apache.avro.Schema |
METADATA_FIELD_SCHEMA |
static org.apache.avro.Schema |
RECORD_KEY_SCHEMA |
| Constructor and Description |
|---|
HoodieAvroUtils() |
| Modifier and Type | Method and Description |
|---|---|
static org.apache.avro.generic.GenericRecord |
addCommitMetadataToRecord(org.apache.avro.generic.GenericRecord record,
String instantTime,
String commitSeqno)
Adds the Hoodie commit metadata into the provided Generic Record.
|
static org.apache.avro.generic.GenericRecord |
addHoodieKeyToRecord(org.apache.avro.generic.GenericRecord record,
String recordKey,
String partitionPath,
String fileName) |
static String |
addMetadataColumnTypes(String hiveColumnTypes) |
static org.apache.avro.Schema |
addMetadataFields(org.apache.avro.Schema schema)
Adds the Hoodie metadata fields to the given schema.
|
static org.apache.avro.Schema |
addMetadataFields(org.apache.avro.Schema schema,
boolean withOperationField)
Adds the Hoodie metadata fields to the given schema.
|
static org.apache.avro.generic.GenericRecord |
addOperationToRecord(org.apache.avro.generic.GenericRecord record,
HoodieOperation operation) |
static byte[] |
avroToBytes(org.apache.avro.generic.GenericRecord record)
Convert a given avro record to bytes.
|
static byte[] |
avroToJson(org.apache.avro.generic.GenericRecord record,
boolean pretty)
Convert a given avro record to json and return the encoded bytes.
|
static String |
avroToJsonString(org.apache.avro.generic.GenericRecord record,
boolean pretty)
Convert a given avro record to json and return the string
|
static org.apache.avro.generic.GenericRecord |
bytesToAvro(byte[] bytes,
int offset,
int length,
org.apache.avro.Schema writerSchema,
org.apache.avro.Schema readerSchema)
Convert serialized bytes back into avro record.
|
static org.apache.avro.generic.GenericRecord |
bytesToAvro(byte[] bytes,
org.apache.avro.Schema schema)
Convert serialized bytes back into avro record.
|
static org.apache.avro.generic.GenericRecord |
bytesToAvro(byte[] bytes,
org.apache.avro.Schema writerSchema,
org.apache.avro.Schema readerSchema)
Convert serialized bytes back into avro record.
|
static Object |
convertValueForSpecificDataTypes(org.apache.avro.Schema fieldSchema,
Object fieldValue,
boolean consistentLogicalTimestampEnabled)
This method converts values for fields with certain Avro/Parquet data types that require special handling.
|
static String |
createFullName(Deque<String> fieldNames) |
static HoodieRecord |
createHoodieRecordFromAvro(org.apache.avro.generic.IndexedRecord data,
String payloadClass,
String preCombineField,
Option<Pair<String,String>> simpleKeyGenFieldsOpt,
Boolean withOperation,
Option<String> partitionNameOp,
Boolean populateMetaFields,
Option<org.apache.avro.Schema> schemaWithoutMetaFields) |
static org.apache.avro.Schema |
createHoodieWriteSchema(org.apache.avro.Schema originalSchema) |
static org.apache.avro.Schema |
createHoodieWriteSchema(String originalSchema) |
static org.apache.avro.Schema |
createHoodieWriteSchema(String originalSchema,
boolean withOperationField) |
static int |
fromJavaDate(Date date)
convert Date to days
|
static org.apache.avro.Schema |
generateProjectionSchema(org.apache.avro.Schema originalSchema,
List<String> fieldNames)
Generate a reader schema off the provided writeSchema, to just project out the provided columns.
|
static Object |
getFieldVal(org.apache.avro.generic.GenericRecord record,
String key)
Obtain value of the provided key, which is consistent with avro before 1.10
|
static Object |
getFieldVal(org.apache.avro.generic.GenericRecord record,
String key,
boolean returnNullIfNotFound)
Obtain value of the provided key, when set returnNullIfNotFound false,
it is consistent with avro after 1.10
|
static org.apache.avro.Schema |
getNestedFieldSchemaFromRecord(org.apache.avro.generic.GenericRecord record,
String fieldName)
Get schema for the given field and record.
|
static org.apache.avro.Schema |
getNestedFieldSchemaFromWriteSchema(org.apache.avro.Schema writeSchema,
String fieldName)
Get schema for the given field and write schema.
|
static Object |
getNestedFieldVal(org.apache.avro.generic.GenericRecord record,
String fieldName,
boolean returnNullIfNotFound,
boolean consistentLogicalTimestampEnabled)
Obtain value of the provided field, denoted by dot notation.
|
static String |
getNestedFieldValAsString(org.apache.avro.generic.GenericRecord record,
String fieldName,
boolean returnNullIfNotFound,
boolean consistentLogicalTimestampEnabled)
Obtain value of the provided field as string, denoted by dot notation.
|
static Option<String> |
getNullableValAsString(org.apache.avro.generic.GenericRecord rec,
String fieldName)
Returns the string value of the given record
rec and field fieldName. |
static org.apache.avro.Schema |
getNullSchema() |
static Object[] |
getRecordColumnValues(HoodieRecord record,
String[] columns,
org.apache.avro.Schema schema,
boolean consistentLogicalTimestampEnabled)
Gets record column values into object array.
|
static Object |
getRecordColumnValues(HoodieRecord record,
String[] columns,
SerializableSchema schema,
boolean consistentLogicalTimestampEnabled)
Gets record column values into one object.
|
static org.apache.avro.Schema |
getRecordKeyPartitionPathSchema()
Fetch schema for record key and partition path.
|
static org.apache.avro.Schema |
getRecordKeySchema() |
static String |
getRootLevelFieldName(String fieldName)
Obtain the root-level field name of a full field name, possibly a nested field.
|
static org.apache.avro.Schema |
getSchemaForFields(org.apache.avro.Schema fileSchema,
List<String> fields)
Fetch schema for record key and partition path.
|
static boolean |
gteqAvro1_10() |
static boolean |
gteqAvro1_9() |
static <T extends org.apache.avro.generic.IndexedRecord> byte[] |
indexedRecordToBytes(T record) |
static boolean |
isMetadataField(String fieldName) |
static boolean |
isSchemaNull(org.apache.avro.Schema schema) |
static boolean |
isTypeNumeric(org.apache.avro.Schema.Type type) |
static org.apache.avro.generic.GenericRecord |
jsonBytesToAvro(byte[] bytes,
org.apache.avro.Schema schema)
Convert json bytes back into avro record.
|
static org.apache.avro.Schema |
makeFieldNonNull(org.apache.avro.Schema schema,
String fieldName,
Object fieldDefaultValue) |
static boolean |
recordNeedsRewriteForExtendedAvroTypePromotion(org.apache.avro.Schema writerSchema,
org.apache.avro.Schema readerSchema)
Avro does not support type promotion from numbers to string.
|
static Option<byte[]> |
recordToBytes(HoodieRecord record,
org.apache.avro.Schema schema)
TODO serialize other type of record.
|
static org.apache.avro.generic.GenericRecord |
removeFields(org.apache.avro.generic.GenericRecord record,
Set<String> fieldsToRemove)
Given an Avro record and list of columns to remove, this method removes the list of columns from
the given avro record using rewriteRecord method.
|
static org.apache.avro.Schema |
removeFields(org.apache.avro.Schema schema,
Set<String> fieldsToRemove) |
static org.apache.avro.Schema |
removeMetadataFields(org.apache.avro.Schema schema) |
static org.apache.avro.generic.GenericRecord |
rewriteEvolutionRecordWithMetadata(org.apache.avro.generic.GenericRecord genericRecord,
org.apache.avro.Schema newSchema,
String fileName) |
static org.apache.avro.generic.GenericRecord |
rewriteRecord(org.apache.avro.generic.GenericRecord oldRecord,
org.apache.avro.Schema newSchema)
Given an Avro record with a given schema, rewrites it into the new schema while setting fields only from the new
schema.
|
static org.apache.avro.generic.GenericRecord |
rewriteRecordDeep(org.apache.avro.generic.GenericRecord oldRecord,
org.apache.avro.Schema newSchema) |
static org.apache.avro.generic.GenericRecord |
rewriteRecordDeep(org.apache.avro.generic.GenericRecord oldRecord,
org.apache.avro.Schema newSchema,
boolean validate) |
static List<org.apache.avro.generic.GenericRecord> |
rewriteRecords(List<org.apache.avro.generic.GenericRecord> records,
org.apache.avro.Schema newSchema)
Converts list of
GenericRecord provided into the GenericRecord adhering to the
provided newSchema. |
static org.apache.avro.generic.GenericRecord |
rewriteRecordWithMetadata(org.apache.avro.generic.GenericRecord genericRecord,
org.apache.avro.Schema newSchema,
String fileName) |
static org.apache.avro.generic.GenericRecord |
rewriteRecordWithNewSchema(org.apache.avro.generic.IndexedRecord oldRecord,
org.apache.avro.Schema newSchema) |
static org.apache.avro.generic.GenericRecord |
rewriteRecordWithNewSchema(org.apache.avro.generic.IndexedRecord oldRecord,
org.apache.avro.Schema newSchema,
Map<String,String> renameCols)
Given an avro record with a given schema, rewrites it into the new schema while setting fields only from the new schema.
|
static org.apache.avro.generic.GenericRecord |
rewriteRecordWithNewSchema(org.apache.avro.generic.IndexedRecord oldRecord,
org.apache.avro.Schema newSchema,
Map<String,String> renameCols,
boolean validate) |
static Iterator<org.apache.avro.generic.GenericRecord> |
rewriteRecordWithNewSchema(Iterator<org.apache.avro.generic.GenericRecord> oldRecords,
org.apache.avro.Schema newSchema,
Map<String,String> renameCols) |
static Iterator<org.apache.avro.generic.GenericRecord> |
rewriteRecordWithNewSchema(Iterator<org.apache.avro.generic.GenericRecord> oldRecords,
org.apache.avro.Schema newSchema,
Map<String,String> renameCols,
boolean validate)
Given avro records, rewrites them with new schema.
|
static String |
safeAvroToJsonString(org.apache.avro.generic.GenericRecord record)
Convert a given avro record to a JSON string.
|
static String |
sanitizeName(String name)
Sanitizes Name according to Avro rule for names.
|
static String |
sanitizeName(String name,
String invalidCharMask)
Sanitizes Name according to Avro rule for names.
|
static org.apache.avro.generic.GenericRecord |
stitchRecords(org.apache.avro.generic.GenericRecord left,
org.apache.avro.generic.GenericRecord right,
org.apache.avro.Schema stitchedSchema) |
static Date |
toJavaDate(int days)
convert days to Date
|
static Comparable<?> |
unwrapAvroValueWrapper(Object avroValueWrapper)
Unwraps Avro value wrapper into Java value.
|
static Object |
wrapValueIntoAvro(Comparable<?> value)
Wraps a value into Avro type wrapper.
|
public static final String AVRO_VERSION
public static final org.apache.avro.Conversions.DecimalConversion DECIMAL_CONVERSION
public static final org.apache.avro.Schema METADATA_FIELD_SCHEMA
public static final org.apache.avro.Schema RECORD_KEY_SCHEMA
public static Option<byte[]> recordToBytes(HoodieRecord record, org.apache.avro.Schema schema) throws IOException
Throws:
IOException
public static byte[] avroToBytes(org.apache.avro.generic.GenericRecord record)
public static <T extends org.apache.avro.generic.IndexedRecord> byte[] indexedRecordToBytes(T record)
public static String avroToJsonString(org.apache.avro.generic.GenericRecord record, boolean pretty) throws IOException
Parameters:
record - The GenericRecord to convert
pretty - Whether to pretty-print the json output
Throws:
IOException
public static String safeAvroToJsonString(org.apache.avro.generic.GenericRecord record)
Convert a given avro record to a JSON string. Use this method over avroToJsonString(org.apache.avro.generic.GenericRecord, boolean) when simply trying to print the record contents without any guarantees around their correctness.
Parameters:
record - The GenericRecord to convert
public static byte[] avroToJson(org.apache.avro.generic.GenericRecord record,
boolean pretty)
throws IOException
Parameters:
record - The GenericRecord to convert
pretty - Whether to pretty-print the json output
Throws:
IOException
public static org.apache.avro.generic.GenericRecord bytesToAvro(byte[] bytes,
org.apache.avro.Schema schema)
throws IOException
Throws:
IOException
public static org.apache.avro.generic.GenericRecord bytesToAvro(byte[] bytes,
org.apache.avro.Schema writerSchema,
org.apache.avro.Schema readerSchema)
throws IOException
Throws:
IOException
public static org.apache.avro.generic.GenericRecord bytesToAvro(byte[] bytes,
int offset,
int length,
org.apache.avro.Schema writerSchema,
org.apache.avro.Schema readerSchema)
throws IOException
Throws:
IOException
public static org.apache.avro.generic.GenericRecord jsonBytesToAvro(byte[] bytes,
org.apache.avro.Schema schema)
throws IOException
Throws:
IOException
public static boolean isTypeNumeric(org.apache.avro.Schema.Type type)
public static boolean isMetadataField(String fieldName)
public static org.apache.avro.Schema createHoodieWriteSchema(org.apache.avro.Schema originalSchema)
public static org.apache.avro.Schema createHoodieWriteSchema(String originalSchema)
public static org.apache.avro.Schema createHoodieWriteSchema(String originalSchema, boolean withOperationField)
public static org.apache.avro.Schema addMetadataFields(org.apache.avro.Schema schema)
Parameters:
schema - The schema
public static org.apache.avro.Schema addMetadataFields(org.apache.avro.Schema schema,
boolean withOperationField)
Parameters:
schema - The schema
withOperationField - Whether to include the '_hoodie_operation' field
public static boolean isSchemaNull(org.apache.avro.Schema schema)
public static org.apache.avro.Schema removeMetadataFields(org.apache.avro.Schema schema)
public static org.apache.avro.Schema removeFields(org.apache.avro.Schema schema,
Set<String> fieldsToRemove)
public static org.apache.avro.Schema makeFieldNonNull(org.apache.avro.Schema schema,
String fieldName,
Object fieldDefaultValue)
public static org.apache.avro.Schema getRecordKeySchema()
public static org.apache.avro.Schema getRecordKeyPartitionPathSchema()
public static org.apache.avro.Schema getSchemaForFields(org.apache.avro.Schema fileSchema,
List<String> fields)
public static org.apache.avro.generic.GenericRecord addHoodieKeyToRecord(org.apache.avro.generic.GenericRecord record,
String recordKey,
String partitionPath,
String fileName)
public static org.apache.avro.generic.GenericRecord addOperationToRecord(org.apache.avro.generic.GenericRecord record,
HoodieOperation operation)
public static org.apache.avro.generic.GenericRecord addCommitMetadataToRecord(org.apache.avro.generic.GenericRecord record,
String instantTime,
String commitSeqno)
public static org.apache.avro.generic.GenericRecord stitchRecords(org.apache.avro.generic.GenericRecord left,
org.apache.avro.generic.GenericRecord right,
org.apache.avro.Schema stitchedSchema)
public static org.apache.avro.generic.GenericRecord rewriteRecord(org.apache.avro.generic.GenericRecord oldRecord,
org.apache.avro.Schema newSchema)
NOTE: This method is rewriting every record's field that is record itself recursively. It's caller's responsibility to make sure that no unnecessary re-writing occurs (by preemptively checking whether the record does require re-writing to adhere to the new schema)
NOTE: Here, the assumption is that you cannot go from an evolved schema (schema with (N) fields) to an older schema (schema with (N-1) fields). All fields present in the older record schema MUST be present in the new schema and the default/existing values are carried over.
This particular method does the following:
For SpecificRecord we ignore Hudi Metadata fields, because for code generated avro classes (HoodieMetadataRecord), the avro record is a SpecificBaseRecord type instead of a GenericRecord. SpecificBaseRecord throws null pointer exception for record.get(name) if name is not present in the schema of the record (which happens when converting a SpecificBaseRecord without hoodie_metadata_fields to a new record with it). In this case, we do NOT set the defaults for the hoodie_metadata_fields explicitly, instead, the new record assumes the default defined in the avro schema itself. TODO: See if we can always pass GenericRecord instead of SpecificBaseRecord in some cases.
public static org.apache.avro.generic.GenericRecord rewriteRecordWithMetadata(org.apache.avro.generic.GenericRecord genericRecord,
org.apache.avro.Schema newSchema,
String fileName)
public static org.apache.avro.generic.GenericRecord rewriteEvolutionRecordWithMetadata(org.apache.avro.generic.GenericRecord genericRecord,
org.apache.avro.Schema newSchema,
String fileName)
public static List<org.apache.avro.generic.GenericRecord> rewriteRecords(List<org.apache.avro.generic.GenericRecord> records, org.apache.avro.Schema newSchema)
Converts list of GenericRecord provided into the GenericRecord adhering to the
provided newSchema.
To better understand conversion rules please check rewriteRecord(GenericRecord, Schema)
public static org.apache.avro.generic.GenericRecord removeFields(org.apache.avro.generic.GenericRecord record,
Set<String> fieldsToRemove)
To better understand how it removes please check rewriteRecord(GenericRecord, Schema)
public static org.apache.avro.Schema generateProjectionSchema(org.apache.avro.Schema originalSchema,
List<String> fieldNames)
public static String getRootLevelFieldName(String fieldName)
Parameters:
fieldName - The field name.
public static Object getFieldVal(org.apache.avro.generic.GenericRecord record, String key)
public static Object getFieldVal(org.apache.avro.generic.GenericRecord record, String key, boolean returnNullIfNotFound)
public static String getNestedFieldValAsString(org.apache.avro.generic.GenericRecord record, String fieldName, boolean returnNullIfNotFound, boolean consistentLogicalTimestampEnabled)
public static Object getNestedFieldVal(org.apache.avro.generic.GenericRecord record, String fieldName, boolean returnNullIfNotFound, boolean consistentLogicalTimestampEnabled)
public static org.apache.avro.Schema getNestedFieldSchemaFromRecord(org.apache.avro.generic.GenericRecord record,
String fieldName)
Parameters:
record - record containing the value of the given field
fieldName - name of the field
public static org.apache.avro.Schema getNestedFieldSchemaFromWriteSchema(org.apache.avro.Schema writeSchema,
String fieldName)
See also: getNestedFieldSchemaFromRecord(GenericRecord, String)
Parameters:
writeSchema - write schema of the record
fieldName - name of the field
public static Option<String> getNullableValAsString(org.apache.avro.generic.GenericRecord rec, String fieldName)
Returns the string value of the given record rec and field fieldName.
The field and value both could be missing.
Parameters:
rec - The record
fieldName - The field name
public static Object convertValueForSpecificDataTypes(org.apache.avro.Schema fieldSchema, Object fieldValue, boolean consistentLogicalTimestampEnabled)
Parameters:
fieldSchema - avro field schema
fieldValue - avro field value
public static org.apache.avro.Schema getNullSchema()
public static String sanitizeName(String name)
Parameters:
name - input name
public static String sanitizeName(String name, String invalidCharMask)
Parameters:
name - input name
invalidCharMask - replacement for invalid characters.
public static Object[] getRecordColumnValues(HoodieRecord record, String[] columns, org.apache.avro.Schema schema, boolean consistentLogicalTimestampEnabled)
Parameters:
record - Hoodie record.
columns - Names of the columns to get values.
schema - Schema instance.
public static Object getRecordColumnValues(HoodieRecord record, String[] columns, SerializableSchema schema, boolean consistentLogicalTimestampEnabled)
Parameters:
record - Hoodie record.
columns - Names of the columns to get values.
schema - SerializableSchema instance.
public static org.apache.avro.generic.GenericRecord rewriteRecordWithNewSchema(org.apache.avro.generic.IndexedRecord oldRecord,
org.apache.avro.Schema newSchema)
public static org.apache.avro.generic.GenericRecord rewriteRecordWithNewSchema(org.apache.avro.generic.IndexedRecord oldRecord,
org.apache.avro.Schema newSchema,
Map<String,String> renameCols)
Parameters:
oldRecord - oldRecord to be rewritten
newSchema - newSchema used to rewrite oldRecord
renameCols - a map storing all renamed columns, (k, v) -> (colNameFromNewSchema, colNameFromOldSchema)
public static org.apache.avro.generic.GenericRecord rewriteRecordWithNewSchema(org.apache.avro.generic.IndexedRecord oldRecord,
org.apache.avro.Schema newSchema,
Map<String,String> renameCols,
boolean validate)
public static boolean recordNeedsRewriteForExtendedAvroTypePromotion(org.apache.avro.Schema writerSchema,
org.apache.avro.Schema readerSchema)
public static Date toJavaDate(int days)
NOTE: This method could only be used in tests
public static int fromJavaDate(Date date)
NOTE: This method could only be used in tests
public static HoodieRecord createHoodieRecordFromAvro(org.apache.avro.generic.IndexedRecord data, String payloadClass, String preCombineField, Option<Pair<String,String>> simpleKeyGenFieldsOpt, Boolean withOperation, Option<String> partitionNameOp, Boolean populateMetaFields, Option<org.apache.avro.Schema> schemaWithoutMetaFields)
public static Iterator<org.apache.avro.generic.GenericRecord> rewriteRecordWithNewSchema(Iterator<org.apache.avro.generic.GenericRecord> oldRecords, org.apache.avro.Schema newSchema, Map<String,String> renameCols, boolean validate)
Parameters:
oldRecords - oldRecords to be rewritten
newSchema - newSchema used to rewrite oldRecord
renameCols - a map storing all renamed columns, (k, v) -> (colNameFromNewSchema, colNameFromOldSchema)
public static Iterator<org.apache.avro.generic.GenericRecord> rewriteRecordWithNewSchema(Iterator<org.apache.avro.generic.GenericRecord> oldRecords, org.apache.avro.Schema newSchema, Map<String,String> renameCols)
public static org.apache.avro.generic.GenericRecord rewriteRecordDeep(org.apache.avro.generic.GenericRecord oldRecord,
org.apache.avro.Schema newSchema)
public static org.apache.avro.generic.GenericRecord rewriteRecordDeep(org.apache.avro.generic.GenericRecord oldRecord,
org.apache.avro.Schema newSchema,
boolean validate)
public static boolean gteqAvro1_9()
public static boolean gteqAvro1_10()
public static Object wrapValueIntoAvro(Comparable<?> value)
Parameters:
value - Java value.
public static Comparable<?> unwrapAvroValueWrapper(Object avroValueWrapper)
Parameters:
avroValueWrapper - A wrapped value with Avro type wrapper.
Copyright © 2024 The Apache Software Foundation. All rights reserved.