// Generic representation of tree-based models.

// This proto establishes a shared standard: "fully compatible" projects should
// provide support for all reasonable models expressed through it. Therefore,
// it should be kept as simple as possible, and should never contain
// project-specific design choices.

// Status: work in progress. This proto can change at any time without notice.

syntax = "proto3";

package tensorflow.decision_trees;

import "google/protobuf/any.proto";
import "google/protobuf/wrappers.proto";

// Enables arena allocation for the generated C++ classes.
option cc_enable_arenas = true;

// A generic handle for any type of model.
message Model {
  // The concrete model carried by this handle; at most one member is set.
  oneof model {
    // A single decision tree.
    DecisionTree decision_tree = 1;
    // An ordered sequence of sub-models.
    Ensemble ensemble = 2;
    // A model type not covered by this proto, packed as an Any.
    google.protobuf.Any custom_model = 3;
  }
  // Extra payloads attached to the model. Their meaning is not defined in
  // this file; presumably project-specific -- confirm with the producer.
  repeated google.protobuf.Any additional_data = 4;
}

// A model bundled with descriptions of features, keyed by feature id.
message ModelAndFeatures {
  // Description of a single feature.
  message Feature {
    // TODO(jonasz): Remove this field, as it's confusing. Ctx: cr/153569450.
    FeatureId feature_id = 1 [deprecated = true];
    // Extra payloads describing the feature; meaning not defined in this file.
    repeated google.protobuf.Any additional_data = 2;
  }
  // Given a FeatureId feature_id, the feature's description is in
  // features[feature_id.id.value].
  map<string, Feature> features = 1;
  // The model that accompanies the feature descriptions.
  Model model = 2;
  // Extra payloads attached to the bundle; meaning not defined in this file.
  repeated google.protobuf.Any additional_data = 3;
}

// An ordered sequence of models. This message can be used to express bagged or
// boosted models, as well as custom ensembles.
message Ensemble {
  // One entry of the ensemble.
  message Member {
    // The model wrapped by this member.
    Model submodel = 1;
    // Identifier of the submodel. A wrapper type, so it may be left unset.
    google.protobuf.Int32Value submodel_id = 2;
    // Extra payloads attached to this member; meaning not defined in this
    // file.
    repeated google.protobuf.Any additional_data = 3;
  }
  repeated Member members = 100;  // A higher id for more readable printing.

  // The presence of a certain combination_technique indicates how to combine
  // the outputs of member models in order to compute the ensemble's output.
  oneof combination_technique {
    // The ensemble's output is the sum of member models' outputs.
    Summation summation_combination_technique = 1;
    // The ensemble's output is the average of member models' outputs.
    Averaging averaging_combination_technique = 2;
    // A combination rule not covered by this proto, packed as an Any.
    google.protobuf.Any custom_combination_technique = 3;
  }
  // Extra payloads attached to the ensemble; meaning not defined in this file.
  repeated google.protobuf.Any additional_data = 4;
}

// When present, the Ensemble's output is the sum of member models' outputs.
message Summation {
  // Extra payloads attached to this technique; meaning not defined in this
  // file.
  repeated google.protobuf.Any additional_data = 1;
}

// When present, the Ensemble's output is the average of member models' outputs.
message Averaging {
  // Extra payloads attached to this technique; meaning not defined in this
  // file.
  repeated google.protobuf.Any additional_data = 1;
}

// A decision tree, stored as a flat list of nodes.
//
// NOTE(review): BinaryNode refers to its children by id; whether such an id
// is a TreeNode.node_id or an index into `nodes` is not stated in this file
// -- confirm with consumers.
message DecisionTree {
  // All nodes of the tree.
  repeated TreeNode nodes = 1;
  // Extra payloads attached to the tree; meaning not defined in this file.
  repeated google.protobuf.Any additional_data = 2;
}

// A single node of a DecisionTree: either an internal binary node, a leaf, or
// a custom node type.
message TreeNode {
  // Following fields are provided for convenience and better readability.
  // Filling them in is not required.
  google.protobuf.Int32Value node_id = 1;
  google.protobuf.Int32Value depth = 2;
  google.protobuf.Int32Value subtree_size = 3;

  // The kind of node; at most one member is set.
  oneof node_type {
    // An internal node with two children.
    BinaryNode binary_node = 4;
    // A terminal node holding output values.
    Leaf leaf = 5;
    // A node type not covered by this proto, packed as an Any.
    google.protobuf.Any custom_node_type = 6;
  }

  // Extra payloads attached to the node; meaning not defined in this file.
  repeated google.protobuf.Any additional_data = 7;
}

// An internal tree node with a left and a right child. A datapoint is routed
// to one of them according to left_child_test.
message BinaryNode {
  // Id of the child that receives datapoints satisfying left_child_test.
  google.protobuf.Int32Value left_child_id = 1;
  // Id of the other child.
  google.protobuf.Int32Value right_child_id = 2;
  // Names one of the two children.
  enum Direction {
    LEFT = 0;
    RIGHT = 1;
  }
  // When left_child_test is undefined for a particular datapoint (e.g. because
  // it's not defined when feature value is missing), the datapoint should go
  // in this direction.
  // Being a proto3 enum field, it reads as LEFT (value 0) when left unset.
  Direction default_direction = 3;
  // When a datapoint satisfies the test, it should be propagated to the left
  // child.
  oneof left_child_test {
    // Threshold comparison on a feature-derived value.
    InequalityTest inequality_left_child_test = 4;
    // A test not covered by this proto, packed as an Any.
    google.protobuf.Any custom_left_child_test = 5;
  }
}

// A SparseVector represents a vector in which only certain select elements
// are non-zero.  Maps labels to values (e.g. class id to probability or count).
message SparseVector {
  // Label -> value. Map iteration order is unspecified.
  map<int64, Value> sparse_value = 1;
}

// An ordered list of values; value[i] is the i-th component.
message Vector {
  repeated Value value = 1;
}

// A terminal node of a decision tree, holding the values it outputs.
message Leaf {
  // The leaf's payload, in either dense or sparse form; at most one is set.
  oneof leaf {
    // The interpretation of the values held in the leaves of a decision tree
    // is application specific, but some common cases are:
    // 1) len(vector) = 1, and the floating point value[0] holds the class 0
    //    probability in a two class classification problem.
    // 2) len(vector) = 1, and the integer value[0] holds the class prediction.
    // 3) The floating point value[i] holds the class i probability prediction.
    // 4) The floating point value[i] holds the i-th component of the
    //    vector prediction in a regression problem.
    // 5) sparse_vector holds the sparse class predictions for a classification
    //    problem with a large number of classes.
    Vector vector = 1;
    SparseVector sparse_vector = 2;
  }
  // For non-standard handling of leaves.
  repeated google.protobuf.Any additional_data = 3;
}

// Identifies a feature by a string id.
message FeatureId {
  // The identifier. A wrapper type, so it may be left unset.
  google.protobuf.StringValue id = 1;
  // Extra payloads attached to the id; meaning not defined in this file.
  repeated google.protobuf.Any additional_data = 2;
}

// A weighted combination of several features.
message ObliqueFeatures {
  // total value is sum(features[i] * weights[i]).
  // features and weights are matched by index; presumably they must have the
  // same length -- confirm with consumers.
  repeated FeatureId features = 1;
  repeated float weights = 2;
}

// A test that compares a feature-derived value against a threshold.
message InequalityTest {
  // The quantity being tested; at most one member is set.
  // When the feature is missing, the test's outcome is undefined.
  oneof FeatureSum {
    // A single feature.
    FeatureId feature_id = 1;
    // A weighted sum of several features.
    ObliqueFeatures oblique = 4;
  }
  // The comparison operator.
  // NOTE(review): presumably read as "<tested value> <op> <threshold>" --
  // confirm operand order with consumers.
  enum Type {
    LESS_OR_EQUAL = 0;
    LESS_THAN = 1;
    GREATER_OR_EQUAL = 2;
    GREATER_THAN = 3;
  }
  // Being a proto3 enum field, it reads as LESS_OR_EQUAL (value 0) when unset.
  Type type = 2;
  // The value the tested quantity is compared against.
  Value threshold = 3;
}

// Represents a single value of any type, e.g. 5 or "abc".
message Value {
  // The held value; at most one member is set.
  oneof value {
    float float_value = 1;
    double double_value = 2;
    int32 int32_value = 3;
    int64 int64_value = 4;
    // A value of a type not covered above, packed as an Any.
    google.protobuf.Any custom_value = 5;
  }
}
