roboto.domain.topics.parquet.parquet_parser#

Module Contents#

class roboto.domain.topics.parquet.parquet_parser.ParquetParser(source, min_required_row_group_size=100000, small_row_group_count_threshold=32)#
Parameters:
  • source (pathlib.Path)

  • min_required_row_group_size (int)

  • small_row_group_count_threshold (int)

property column_count: int#
Return type:

int

extract_timestamp_info(timestamp_column_name=None, timestamp_unit=None)#
Parameters:

  • timestamp_column_name (optional; defaults to None)

  • timestamp_unit (optional; defaults to None)

Return type:

roboto.domain.topics.parquet.timestamp.TimestampInfo

property fields: Generator[pyarrow.Field, None, None]#
Return type:

Generator[pyarrow.Field, None, None]

find_timestamp_field_by_type()#
Return type:

pyarrow.Field

get_data_for_column(column_name)#
Parameters:

column_name (str)

Return type:

pyarrow.Table

get_timestamp_field_by_name(column_name)#
Parameters:

column_name (str)

Return type:

pyarrow.Field

static is_parquet_file(path)#
Parameters:

path (pathlib.Path)

Return type:

bool

requires_rewrite(timestamp)#
Parameters:

timestamp (roboto.domain.topics.parquet.timestamp.TimestampInfo)

Return type:

bool

rewrite(outfile, timestamp, target_row_group_size_bytes=100 * 1000 * 1000)#
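Parameters:

  • outfile

  • timestamp

  • target_row_group_size_bytes (optional; defaults to 100 * 1000 * 1000, i.e. 100,000,000)

Return type: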

None

property row_count: int#
Return type:

int

property row_group_count: int#
Return type:

int

property row_group_size: int#
Return type:

int

roboto.domain.topics.parquet.parquet_parser.logger#