flexflow · lockshaw · Sep 16, 2024 · Sep 9, 2024 · Sep 9, 2024 · Sep 12, 2024
diff --git a/lib/kernels/include/kernels/legion_dim.h b/lib/kernels/include/kernels/legion_dim.h
@@ -2,7 +2,7 @@
 #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_LEGION_DIM_H
 
 #include "kernels/legion_dim_t.dtg.h"
-#include "op-attrs/dim_ordered.h"
+#include "op-attrs/dim_ordered/dim_ordered.h"
 
 namespace FlexFlow {
 

diff --git a/lib/local-execution/include/local-execution/serialization.h b/lib/local-execution/include/local-execution/serialization.h
@@ -3,7 +3,7 @@
 
 #include "kernels/device.h"
 #include "kernels/nccl.h"
-#include "op-attrs/dim_ordered.h"
+#include "op-attrs/dim_ordered/dim_ordered.h"
 #include "utils/required.h"
 #include "utils/strong_typedef.h"
 #include "utils/type_traits.h"

diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc
@@ -51,7 +51,7 @@
   for (ParallelTensorShape const &input : inputs) {
     TensorShape tensor_shape = get_piece_shape(input);
     tensor_guid_t tensor_id =
-        cg_builder.create_tensor(tensor_shape, CreateGrad::YES);
+        cg_builder.create_input(tensor_shape, CreateGrad::YES);
     GenericTensorAccessorW tensor_backing =
         allocator.allocate_tensor(tensor_shape);
     tensor_backing_map.insert({tensor_id, tensor_backing});
@@ -69,7 +69,10 @@
   std::vector<tensor_guid_t> output_tensor_ids =
       cg_builder.add_layer(layer_attrs,
                            input_tensor_ids,
-                           get_vector_piece_attrs(weights),
+                           transform(get_vector_piece_attrs(weights),
+                                     [&](TensorAttrs const &a) {
+                                       return cg_builder.create_weight(a);
+                                     }),
                            get_vector_piece_attrs(outputs));
 
   LocalTrainingBacking local_backing(allocator,

diff --git a/lib/local-execution/test/src/test_local_slots_backing.cc b/lib/local-execution/test/src/test_local_slots_backing.cc
@@ -37,11 +37,11 @@ TEST_SUITE(FF_TEST_SUITE) {
     // build graph
     ComputationGraphBuilder cg_builder;
     tensor_guid_t query_guid =
-        cg_builder.create_tensor(query_shape, CreateGrad::YES);
+        cg_builder.create_input(query_shape, CreateGrad::YES);
     tensor_guid_t key_guid =
-        cg_builder.create_tensor(key_shape, CreateGrad::YES);
+        cg_builder.create_input(key_shape, CreateGrad::YES);
     tensor_guid_t value_guid =
-        cg_builder.create_tensor(value_shape, CreateGrad::YES);
+        cg_builder.create_input(value_shape, CreateGrad::YES);
 
     std::string layer_name = "attn1";
     tensor_guid_t output_guid =

diff --git a/lib/models/src/models/transformer.cc b/lib/models/src/models/transformer.cc
@@ -42,7 +42,8 @@ tensor_guid_t create_transformer_encoder_layer(ComputationGraphBuilder &cgb,
                                                          config.num_heads,
                                                          kdim,
                                                          vdim,
-                                                         config.dropout);
+                                                         config.dropout,
+                                                         /*bias=*/false);
   assert(are_tensor_guid_shapes_equivalent(
       cgb.computation_graph, input, self_attention));
 
@@ -88,7 +89,8 @@ tensor_guid_t
                                                          config.num_heads,
                                                          kdim,
                                                          vdim,
-                                                         config.dropout);
+                                                         config.dropout,
+                                                         /*bias=*/false);
   assert(are_tensor_guid_shapes_equivalent(
       cgb.computation_graph, input, self_attention));
 
@@ -107,7 +109,8 @@ tensor_guid_t
                                               config.num_heads,
                                               kdim,
                                               vdim,
-                                              config.dropout);
+                                              config.dropout,
+                                              /*bias=*/false);
   assert(are_tensor_guid_shapes_equivalent(cgb.computation_graph, input, mha));
 
   tensor_guid_t mha_normalized =
@@ -149,7 +152,7 @@ ComputationGraph
           config.batch_size, config.sequence_length, config.num_features}},
       DataType::FLOAT,
   };
-  tensor_guid_t input = cgb.create_tensor(input_shape, CreateGrad::YES);
+  tensor_guid_t input = cgb.create_input(input_shape, CreateGrad::YES);
 
   tensor_guid_t encoder_output = create_transformer_encoder(cgb, config, input);
   tensor_guid_t decoder_output =

diff --git a/lib/op-attrs/include/op-attrs/computation_graph_op_attrs.h b/lib/op-attrs/include/op-attrs/computation_graph_op_attrs.h
@@ -2,12 +2,15 @@
 #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_COMPUTATION_GRAPH_OP_ATTRS_H
 
 #include "op-attrs/computation_graph_op_attrs.dtg.h"
+#include "op-attrs/pcg_operator_attrs.dtg.h"
 #include "utils/record_formatter.h"
 
 namespace FlexFlow {
 
 OperatorType get_op_type(ComputationGraphOpAttrs const &);
 RecordFormatter as_dot(ComputationGraphOpAttrs const &);
+ComputationGraphOpAttrs
+    compgraph_op_attrs_from_pcg_op_attrs(PCGOperatorAttrs const &);
 
 } // namespace FlexFlow
 

diff --git a/lib/op-attrs/include/op-attrs/dim_ordered.h b/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h
rename from lib/op-attrs/include/op-attrs/dim_ordered.h
rename to lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h
diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/enumerate.h b/lib/op-attrs/include/op-attrs/dim_ordered/enumerate.h
@@ -1,7 +1,7 @@
 #ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_ENUMERATE_H
 #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_ENUMERATE_H
 
-#include "op-attrs/dim_ordered.h"
+#include "op-attrs/dim_ordered/dim_ordered.h"
 #include "utils/bidict/bidict.h"
 #include "utils/containers/count.h"
 

diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/ff_ordered_of.h b/lib/op-attrs/include/op-attrs/dim_ordered/ff_ordered_of.h
@@ -1,7 +1,7 @@
 #ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_FF_ORDERED_OF_H
 #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_FF_ORDERED_OF_H
 
-#include "op-attrs/dim_ordered.h"
+#include "op-attrs/dim_ordered/dim_ordered.h"
 
 namespace FlexFlow {
 

diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/get_idxs.h b/lib/op-attrs/include/op-attrs/dim_ordered/get_idxs.h
@@ -1,7 +1,7 @@
 #ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_GET_IDXS_H
 #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_GET_IDXS_H
 
-#include "op-attrs/dim_ordered.h"
+#include "op-attrs/dim_ordered/dim_ordered.h"
 #include "utils/containers/count.h"
 #include "utils/containers/transform.h"
 

diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/slice.h b/lib/op-attrs/include/op-attrs/dim_ordered/slice.h
@@ -1,7 +1,7 @@
 #ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_SLICE_H
 #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_SLICE_H
 
-#include "op-attrs/dim_ordered.h"
+#include "op-attrs/dim_ordered/dim_ordered.h"
 #include "utils/containers/as_vector.h"
 #include "utils/containers/subvec.h"
 #include "utils/containers/transform.h"

diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/transform.h b/lib/op-attrs/include/op-attrs/dim_ordered/transform.h
@@ -1,7 +1,7 @@
 #ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_TRANSFORM_H
 #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_TRANSFORM_H
 
-#include "op-attrs/dim_ordered.h"
+#include "op-attrs/dim_ordered/dim_ordered.h"
 #include "utils/containers/as_vector.h"
 #include "utils/containers/vector_transform.h"
 

diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/zip.h b/lib/op-attrs/include/op-attrs/dim_ordered/zip.h
@@ -1,7 +1,7 @@
 #ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_ZIP_H
 #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_ZIP_H
 
-#include "op-attrs/dim_ordered.h"
+#include "op-attrs/dim_ordered/dim_ordered.h"
 #include "utils/containers/as_vector.h"
 #include "utils/containers/zip.h"
 

diff --git a/lib/op-attrs/include/op-attrs/get_incoming_tensor_roles.h b/lib/op-attrs/include/op-attrs/get_incoming_tensor_roles.h
@@ -0,0 +1,17 @@
+#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_GET_INCOMING_TENSOR_ROLES_H
+#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_GET_INCOMING_TENSOR_ROLES_H
+
+#include "op-attrs/computation_graph_op_attrs.dtg.h"
+#include "op-attrs/incoming_tensor_role.dtg.h"
+#include "op-attrs/pcg_operator_attrs.dtg.h"
+
+namespace FlexFlow {
+// Roles of an operator's incoming tensors; num_inputs meaning: TODO confirm.
+std::vector<IncomingTensorRole>
+    get_incoming_tensor_roles(ComputationGraphOpAttrs const &, int num_inputs);
+std::vector<IncomingTensorRole>
+    get_incoming_tensor_roles(PCGOperatorAttrs const &, int num_inputs);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/op-attrs/include/op-attrs/incoming_tensor_role.enum.toml b/lib/op-attrs/include/op-attrs/incoming_tensor_role.enum.toml
@@ -0,0 +1,14 @@
+namespace = "FlexFlow"
+name = "IncomingTensorRole"
+features = [
+  "hash",
+  "fmt",
+  "rapidcheck",
+  "json",
+]
+# Enumerators of IncomingTensorRole (presumably emitted to the .dtg.h; verify).
+[[values]]
+name = "INPUT"
+
+[[values]]
+name = "WEIGHT"
diff --git a/lib/op-attrs/include/op-attrs/ops/attention.h b/lib/op-attrs/include/op-attrs/ops/attention.h
@@ -1,6 +1,7 @@
 #ifndef _FLEXFLOW_ATTENTION_ATTRS_H
 #define _FLEXFLOW_ATTENTION_ATTRS_H
 
+#include "op-attrs/incoming_tensor_role.dtg.h"
 #include "op-attrs/ops/attention/multihead_attention_inputs.dtg.h"
 #include "op-attrs/ops/attention/multihead_attention_parallel_inputs.dtg.h"
 #include "op-attrs/ops/attention_attrs.dtg.h"
@@ -37,6 +38,9 @@ int get_kvSeqLength(MultiHeadAttentionInputs const &);
 int get_num_samples(MultiHeadAttentionParallelInputs const &);
 int get_num_samples(MultiHeadAttentionInputs const &);
 
+std::vector<IncomingTensorRole>
+    get_attention_incoming_tensor_roles(MultiHeadAttentionAttrs const &);
+
 tl::expected<TensorShape, std::string>
     get_weights_shape(MultiHeadAttentionAttrs const &,
                       TensorShape const &input_q,
@@ -58,6 +62,22 @@ tl::expected<TensorShape, std::string>
                      TensorShape const &input_k,
                      TensorShape const &input_v);
 
+tl::expected<ParallelTensorDims, std::string>
+    get_weights_parallel_dims(MultiHeadAttentionAttrs const &,
+                              ParallelTensorShape const &input_q,
+                              ParallelTensorShape const &input_k,
+                              ParallelTensorShape const &input_v);
+tl::expected<ParallelTensorDims, std::string>
+    get_input_bias_parallel_dims(MultiHeadAttentionAttrs const &,
+                                 ParallelTensorShape const &input_q,
+                                 ParallelTensorShape const &input_k,
+                                 ParallelTensorShape const &input_v);
+tl::expected<ParallelTensorDims, std::string>
+    get_output_bias_parallel_dims(MultiHeadAttentionAttrs const &,
+                                  ParallelTensorShape const &input_q,
+                                  ParallelTensorShape const &input_k,
+                                  ParallelTensorShape const &input_v);
+
 tl::expected<ParallelTensorShape, std::string>
     get_weights_shape(MultiHeadAttentionAttrs const &,
                       ParallelTensorShape const &input_q,

diff --git a/lib/op-attrs/include/op-attrs/ops/batch_matmul.h b/lib/op-attrs/include/op-attrs/ops/batch_matmul.h
@@ -2,12 +2,15 @@
 #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_OPS_BATCH_MATMUL_H
 
 #include "op-attrs/ops/batch_matmul.dtg.h"
+#include "op-attrs/ops/core.h"
 #include "op-attrs/parallel_tensor_shape.dtg.h"
 #include "op-attrs/tensor_shape.dtg.h"
 #include <tl/expected.hpp>
 
 namespace FlexFlow {
 
+CHECK_VALID_OP_ATTR(BatchMatmulAttrs);
+
 bool is_valid(BatchMatmulAttrs const &,
               ParallelTensorShape const &,
               ParallelTensorShape const &);

diff --git a/lib/op-attrs/include/op-attrs/ops/conv_2d.h b/lib/op-attrs/include/op-attrs/ops/conv_2d.h
@@ -1,6 +1,7 @@
 #ifndef _FLEXFLOW_CONV_2D_ATTRS_H
 #define _FLEXFLOW_CONV_2D_ATTRS_H
 
+#include "op-attrs/incoming_tensor_role.dtg.h"
 #include "op-attrs/ops/conv_2d_attrs.dtg.h"
 #include "op-attrs/ops/core.h"
 #include "op-attrs/parallel_tensor_shape.h"
@@ -10,6 +11,9 @@ namespace FlexFlow {
 
 CHECK_VALID_OP_ATTR(Conv2DAttrs);
 
+std::vector<IncomingTensorRole>
+    get_conv2d_incoming_tensor_roles(Conv2DAttrs const &);
+
 TensorShape get_kernel_shape(Conv2DAttrs const &attrs,
                              TensorShape const &input);
 TensorShape get_bias_shape(Conv2DAttrs const &attrs, TensorShape const &input);

diff --git a/lib/op-attrs/include/op-attrs/ops/layer_norm.h b/lib/op-attrs/include/op-attrs/ops/layer_norm.h
@@ -1,13 +1,17 @@
 #ifndef _FLEXFLOW_OP_META_OPS_LAYER_NORM_ATTRS_H
 #define _FLEXFLOW_OP_META_OPS_LAYER_NORM_ATTRS_H
 
+#include "op-attrs/incoming_tensor_role.dtg.h"
 #include "op-attrs/ops/core.h"
 #include "op-attrs/ops/layer_norm_attrs.dtg.h"
 #include "op-attrs/parallel_tensor_shape.dtg.h"
 #include "op-attrs/tensor_shape.dtg.h"
 
 namespace FlexFlow {
 
+std::vector<IncomingTensorRole>
+    get_layer_norm_incoming_tensor_roles(LayerNormAttrs const &);
+
 tl::expected<TensorShape, std::string> get_output_shape(LayerNormAttrs const &,
                                                         TensorShape const &);
 tl::expected<TensorShape, std::string>

diff --git a/lib/op-attrs/include/op-attrs/ops/linear.h b/lib/op-attrs/include/op-attrs/ops/linear.h
@@ -1,6 +1,7 @@
 #ifndef _FLEXFLOW_LINEAR_ATTRS_H
 #define _FLEXFLOW_LINEAR_ATTRS_H
 
+#include "op-attrs/incoming_tensor_role.dtg.h"
 #include "op-attrs/ops/core.h"
 #include "op-attrs/ops/linear_attrs.dtg.h"
 #include "op-attrs/parallel_tensor_shape.dtg.h"
@@ -10,20 +11,23 @@
 
 namespace FlexFlow {
 
+std::vector<IncomingTensorRole>
+    get_linear_incoming_tensor_roles(LinearAttrs const &);
+
 CHECK_VALID_OP_ATTR(LinearAttrs);
 
 RecordFormatter as_dot(LinearAttrs const &);
 
 tl::expected<TensorShape, std::string>
-    get_kernel_shape(LinearAttrs const &attrs, TensorShape const &input);
+    get_projection_shape(LinearAttrs const &attrs, TensorShape const &input);
 tl::expected<TensorShape, std::string> get_bias_shape(LinearAttrs const &attrs,
                                                       TensorShape const &input);
 tl::expected<TensorShape, std::string>
     get_output_shape(LinearAttrs const &attrs, TensorShape const &input);
 
 tl::expected<ParallelTensorShape, std::string>
-    get_kernel_shape(LinearAttrs const &attrs,
-                     ParallelTensorShape const &input);
+    get_projection_shape(LinearAttrs const &attrs,
+                         ParallelTensorShape const &input);
 tl::expected<ParallelTensorShape, std::string>
     get_bias_shape(LinearAttrs const &attrs, ParallelTensorShape const &input);
 tl::expected<ParallelTensorShape, std::string>

diff --git a/lib/op-attrs/include/op-attrs/ops/topk.h b/lib/op-attrs/include/op-attrs/ops/topk.h
@@ -4,11 +4,13 @@
 #include "op-attrs/ops/core.h"
 #include "op-attrs/ops/topk_attrs.dtg.h"
 #include "op-attrs/parallel_tensor_shape.dtg.h"
+#include "op-attrs/tensor_shape.dtg.h"
 
 namespace FlexFlow {
 
 CHECK_VALID_OP_ATTR(TopKAttrs);
 
+TensorShape get_output_shape(TopKAttrs const &, TensorShape const &);
 ParallelTensorShape get_output_shape(TopKAttrs const &attrs,
                                      ParallelTensorShape const &input_shape);
 

diff --git a/lib/op-attrs/include/op-attrs/ops/transpose_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/transpose_attrs.struct.toml
@@ -12,7 +12,7 @@ features = [
 includes = [
   "op-attrs/ff_dim.h",
   "op-attrs/ff_dim.dtg.h",
-  "op-attrs/dim_ordered.h",
+  "op-attrs/dim_ordered/dim_ordered.h",
 ]
 
 [[fields]]

diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.struct.toml b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.struct.toml
@@ -10,7 +10,7 @@ features = [
 ]
 
 includes = [
-  "op-attrs/dim_ordered.h",
+  "op-attrs/dim_ordered/dim_ordered.h",
   "op-attrs/shard_parallel_dim.dtg.h",
   "op-attrs/replica_parallel_dim_set.dtg.h",
   "<unordered_map>",

diff --git a/lib/op-attrs/include/op-attrs/pcg_operator_attrs.h b/lib/op-attrs/include/op-attrs/pcg_operator_attrs.h
@@ -8,8 +8,8 @@ namespace FlexFlow {
 
 bool is_parallel_op(PCGOperatorAttrs const &);
 OperatorType get_op_type(PCGOperatorAttrs const &);
-ComputationGraphOpAttrs
-    compgraph_op_attrs_from_pcg_op_attrs(PCGOperatorAttrs const &);
+PCGOperatorAttrs
+    pcg_op_attrs_from_compgraph_op_attrs(ComputationGraphOpAttrs const &);
 RecordFormatter as_dot(PCGOperatorAttrs const &);
 
 } // namespace FlexFlow

diff --git a/lib/op-attrs/include/op-attrs/pcg_operator_attrs.variant.toml b/lib/op-attrs/include/op-attrs/pcg_operator_attrs.variant.toml
@@ -13,6 +13,7 @@ includes = [
   "op-attrs/ops/attention_attrs.dtg.h", 
   "op-attrs/ops/batch_matmul.dtg.h", 
   "op-attrs/ops/batch_norm_attrs.dtg.h", 
+  "op-attrs/ops/broadcast_attrs.dtg.h", 
   "op-attrs/ops/cast_attrs.dtg.h", 
   "op-attrs/ops/combine_attrs.dtg.h", 
   "op-attrs/ops/concat_attrs.dtg.h", 
@@ -49,6 +50,10 @@ key = "batch_matmul"
 type = "::FlexFlow::BatchNormAttrs"
 key = "batch_norm"
 
+[[values]]
+type = "::FlexFlow::BroadcastAttrs"
+key = "broadcast"
+
 [[values]]
 type = "::FlexFlow::CastAttrs"
 key = "cast"

diff --git a/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml b/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml
@@ -9,7 +9,7 @@ features = [
   "fmt",
 ]
 includes = [
-  "op-attrs/dim_ordered.h",
+  "op-attrs/dim_ordered/dim_ordered.h",
 ]
 
 [[fields]]