libcudf  23.12.00
parquet_metadata.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2023, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
22 #pragma once
23 
24 #include <cudf/io/types.hpp>
25 
26 #include <optional>
27 #include <string_view>
28 #include <variant>
29 #include <vector>
30 
31 namespace cudf {
32 namespace io {
33 
34 namespace parquet {
// Basic data types in Parquet; determines how data is physically stored.
// Stored as int8_t. The explicit enumerator values are part of the definition;
// UNDEFINED_TYPE (-1) is used for non-leaf schema nodes.
38 enum class TypeKind : int8_t {
39  UNDEFINED_TYPE = -1, // Undefined for non-leaf nodes
40  BOOLEAN = 0,
41  INT32 = 1,
42  INT64 = 2,
43  INT96 = 3, // Deprecated
44  FLOAT = 4,
45  DOUBLE = 5,
46  BYTE_ARRAY = 6,
47  FIXED_LEN_BYTE_ARRAY = 7,
48 };
49 } // namespace parquet
50 
55  public:
// Constructor.
//
// @param name     Column name; copied from the view into the owning `_name` string
// @param type     Parquet type kind of the column, stored in `_type_kind`
// @param children Schemas of the child columns; taken by value and moved into `_children`
63  parquet_column_schema(std::string_view name,
64  parquet::TypeKind type,
65  std::vector<parquet_column_schema> children)
66  : _name{name}, _type_kind{type}, _children{std::move(children)}
67  {
68  }
69 
// Returns the parquet column name; can be empty.
// The deduced return type is a value, so each call returns a copy of `_name`.
75  [[nodiscard]] auto name() const { return _name; }
76 
// Returns the parquet type of the column (the stored `_type_kind`), by value.
82  [[nodiscard]] auto type_kind() const { return _type_kind; }
83 
// Returns the schemas of all child columns as a const reference to `_children`.
// The `const&` qualifier restricts this overload to lvalue objects.
89  [[nodiscard]] auto const& children() const& { return _children; }
90 
// Returns the schemas of all child columns by value, moving `_children` out of the object.
// The `&&` qualifier restricts this overload to rvalue objects.
95  [[nodiscard]] auto children() && { return std::move(_children); }
96 
// Returns the schema of the child with the given index as a const reference.
// Lookup goes through std::vector::at, so an out-of-range `idx` throws std::out_of_range.
// The `const&` qualifier restricts this overload to lvalue objects.
104  [[nodiscard]] auto const& child(int idx) const& { return children().at(idx); }
105 
// Returns the schema of the child with the given index, moved out of this rvalue object.
// Lookup goes through std::vector::at, so an out-of-range `idx` throws std::out_of_range.
//
// `_children` is accessed directly on purpose: inside an &&-qualified member function
// `*this` is still an lvalue, so a call to `children()` selects the `const&` overload and
// yields a const reference. `std::move` on a const reference produces a const rvalue, which
// can only bind to the copy constructor and would deep-copy the whole child subtree.
110  [[nodiscard]] auto child(int idx) && { return std::move(_children.at(idx)); }
111 
// Returns the number of child columns, i.e. the size of the children vector.
117  [[nodiscard]] auto num_children() const { return children().size(); }
118 
119  private:
// Column name; an owning copy of the string_view passed to the constructor.
120  std::string _name;
121  // 3 types available: Physical, Converted, Logical.
// Only the physical type kind is stored here.
122  parquet::TypeKind _type_kind; // Physical
// Schemas of the child columns.
123  std::vector<parquet_column_schema> _children;
124 };
125 
130  public:
// Constructor.
//
// @param root_column_schema Root column schema; taken by value and moved into `_root`
//
// The constructor is not `explicit`, so a parquet_column_schema converts implicitly
// to a parquet_schema.
136  parquet_schema(parquet_column_schema root_column_schema) : _root{std::move(root_column_schema)} {}
137 
// Returns the schema of the struct column that contains all columns as fields,
// as a const reference to `_root`. Restricted to lvalue objects by the `const&` qualifier.
143  [[nodiscard]] auto const& root() const& { return _root; }
144 
// Returns the schema of the struct column that contains all columns as fields,
// by value, moving `_root` out of the object. Restricted to rvalue objects by the `&&` qualifier.
149  [[nodiscard]] auto root() && { return std::move(_root); }
150 
151  private:
// Root column schema supplied to the constructor.
152  parquet_column_schema _root;
153 };
154 
159  public:
// Key-value metadata in the file footer: string keys mapped to string values.
161  using key_value_metadata = std::unordered_map<std::string, std::string>;
162 
172  int64_t num_rows,
174  key_value_metadata file_metadata)
175  : _schema{std::move(schema)},
176  _num_rows{num_rows},
177  _num_rowgroups{num_rowgroups},
178  _file_metadata{std::move(file_metadata)}
179  {
180  }
181 
// Returns the parquet schema as a const reference to `_schema`.
187  [[nodiscard]] auto const& schema() const { return _schema; }
188 
// Returns the number of rows of the root column (the stored `_num_rows`, an int64_t), by value.
196  [[nodiscard]] auto num_rows() const { return _num_rows; }
197 
// Returns the number of rowgroups in the file (the stored `_num_rowgroups`, a size_type), by value.
203  [[nodiscard]] auto num_rowgroups() const { return _num_rowgroups; }
// Returns the key-value metadata in the file footer as a const reference to `_file_metadata`.
209  [[nodiscard]] auto const& metadata() const { return _file_metadata; }
210 
211  private:
// Values supplied to the constructor and exposed through the read-only accessors.
// Parquet schema of the file.
212  parquet_schema _schema;
// Number of rows of the root column.
213  int64_t _num_rows;
// Number of rowgroups in the file.
214  size_type _num_rowgroups;
// Key-value metadata from the file footer.
215  key_value_metadata _file_metadata;
216 };
217 
229 
230 } // namespace io
231 } // namespace cudf
Information about content of a parquet file.
auto const & schema() const
Returns the parquet schema.
auto const & metadata() const
Returns the key-value metadata in the file footer.
auto num_rowgroups() const
Returns the number of rowgroups in the file.
parquet_metadata(parquet_schema schema, int64_t num_rows, size_type num_rowgroups, key_value_metadata file_metadata)
constructor
auto num_rows() const
Returns the number of rows of the root column.
std::unordered_map< std::string, std::string > key_value_metadata
Key-value metadata in the file footer.
parquet_metadata read_parquet_metadata(source_info const &src_info)
Reads metadata of a parquet dataset.
int32_t size_type
Row index type for columns and tables.
Definition: types.hpp:80
cuDF-IO API type definitions
cuDF interfaces
Definition: aggregation.hpp:34
TypeKind
Basic data types in Parquet, determines how data is physically stored.
Schema of a parquet column, including the nested columns.
auto const & child(int idx) const &
Returns schema of the child with the given index.
auto name() const
Returns parquet column name; can be empty.
auto const & children() const &
Returns schemas of all child columns.
auto children() &&
Returns schemas of all child columns.
auto num_children() const
Returns the number of child columns.
auto type_kind() const
Returns parquet type of the column.
auto child(int idx) &&
Returns schema of the child with the given index.
parquet_column_schema(std::string_view name, parquet::TypeKind type, std::vector< parquet_column_schema > children)
constructor
Schema of a parquet file.
auto root() &&
Returns the schema of the struct column that contains all columns as fields.
auto const & root() const &
Returns the schema of the struct column that contains all columns as fields.
parquet_schema(parquet_column_schema root_column_schema)
constructor
Source information for read interfaces.
Definition: io/types.hpp:288