-
Notifications
You must be signed in to change notification settings - Fork 127
Open
Description
HYBRID_TRANSFORMER_FWD_IN_BCKWD model_parallel_NPU_group: 8 ep: 1 pp: 8 vpp: 12 ga: 32 all_gpus: 2048 checkpoints: 0 checkpoint_initiates: 0 pp_comm: 12582912.0
5
embedding_layer -1 197511169 ALLREDUCE 100663296 1 NONE 0 16732161 NONE 0 100
attention_column -1 3714498 ALLGATHER 100663296 3714498 REDUCESCATTER 0 3714498 NONE 0 100
attention_row -1 3714498 REDUCESCATTER 100663296 3714498 ALLGATHER 100663296 3714498 NONE 0 100
mlp_column -1 3080099 ALLGATHER 100663296 3080099 REDUCESCATTER 0 3080099 NONE 0 100
mlp_row -1 3080099 REDUCESCATTER 100663296 3080099 ALLGATHER 100663296 3080099 NONE 0 100
这里 REDUCESCATTER 的 chunk_size 为 0(上面 workload 中对应的通信大小为 0):
DataSet* Sys::generate_collective(
uint64_t size,
int layer_num,
LogicalTopology* topology,
std::vector<CollectiveImplementation*> implementation_per_dimension,
std::vector<bool> dimensions_involved,
ComType collective_type,
SchedulingPolicy pref_scheduling,
EventType event,
Callable* layer_ptr ) {
------BUG-----
uint64_t chunk_size = determine_chunk_size(size, collective_type);
if(id == 0) std::cout << "chunk size is: " << chunk_size << " , size is: " << size << " , layer_num is: " << layer_num << " , node: " << id << std::endl;
uint64_t recommended_chunk_size = chunk_size;
int streams = ceil(((double)size) / chunk_size);
int64_t tmp;
DataSet* dataset = new DataSet(streams);
------BUG-----
......
if (dataset->active) {
streams_injected += count;
dataset->total_streams = count;
}
return dataset;
}
从 astra-sim-alibabacloud/astra-sim/system/Sys.cc 第 1431 行开始,当 chunk_size 为 0 时,`streams = ceil(((double)size) / chunk_size);` 会发生除零(size 同为 0 时结果为 NaN),其结果再转换为 int 时会溢出,导致代码出错。
另外,astra-sim-alibabacloud/astra-sim/system/DataSet.cc 中 DataSet 的构造函数也没有对 total_streams == 0 的情况进行特殊处理。
Metadata
Metadata
Assignees
Labels
No labels