import numpy as np
from tqdm import tqdm
from helper_code import *
from scipy import signal
import pandas as pd
from skmultilearn.model_selection import iterative_train_test_split
import random
import warnings
import sys


def get_nsamp(header):
    """Return the number of samples encoded in a WFDB header string.

    The first header line looks like ``<record> <nleads> <fs> <nsamp> ...``,
    so the sample count is the 4th space-separated token.
    """
    return int(header.split('\n')[0].split(' ')[3])


# Adapted from the original challenge scoring code.
def replace_equivalent_classes(classes, equivalent_classes):
    """For each set of equivalent classes, replace each member with the set's
    representative class.  Mutates and returns ``classes``.
    """
    for j, x in enumerate(classes):
        for multiple_classes in equivalent_classes:
            if x in multiple_classes:
                # Use the first class as the representative class.
                classes[j] = multiple_classes[0]
    return classes


def mixup(data, mix_data):
    """Sample-wise mixup of two recordings: lambda*a + (1-lambda)*b.

    The mixing coefficient is drawn from Beta(10, 10), i.e. concentrated
    around 0.5.  The longer recording is truncated to the shorter one's
    length so the shapes match.

    Returns the mixed recording and the lambda used, so the caller can
    interpolate the labels with the same weight.
    """
    mix_lambda = np.random.beta(10, 10)
    if data.shape[1] > mix_data.shape[1]:
        data = mix_lambda * data[:, :mix_data.shape[1]] + (1 - mix_lambda) * mix_data
    else:
        data = mix_lambda * data + (1 - mix_lambda) * mix_data[:, :data.shape[1]]
    return data, mix_lambda


def _paste_random_window(data, mix_data, cutmix_lambda):
    """Overwrite a random window of ``data`` with the same window of ``mix_data``.

    The window length is ``ceil(L * cutmix_lambda)`` where L is the shorter of
    the two recordings, so the window fits inside both arrays.  Mutates
    ``data`` in place and returns it.
    """
    seq_length = min(data.shape[1], mix_data.shape[1])
    win_len = int(np.ceil(seq_length * cutmix_lambda))
    start = np.random.randint(0, seq_length - win_len + 1)
    end = start + win_len
    data[:, start:end] = mix_data[:, start:end]
    return data


def cutmix(data, mix_data):
    """CutMix with a random window fraction drawn from Beta(10, 10).

    Returns the modified recording and the window fraction (used by callers
    as the label-interpolation weight).
    """
    cutmix_lambda = np.random.beta(10, 10)
    return _paste_random_window(data, mix_data, cutmix_lambda), cutmix_lambda


def cutmix_fix_length(data, mix_data):
    """CutMix with a fixed window fraction of 0.2 (window position random)."""
    cutmix_lambda = 0.2
    return _paste_random_window(data, mix_data, cutmix_lambda), cutmix_lambda


def same_shape_mixup(data, mix_data):
    """Mixup without label interpolation.

    Only applied when both recordings already have the same length;
    otherwise ``data`` is returned unchanged.
    """
    if data.shape[1] == mix_data.shape[1]:
        # Beta(10, 10) keeps the mixing weight concentrated near 0.5.
        mix_lambda = np.random.beta(10, 10)
        data = mix_lambda * data + (1 - mix_lambda) * mix_data
    return data


def expand_leads(recording, input_leads):
    """Zero-pad a recording up to the canonical 12-lead layout.

    Each available input lead is copied into its canonical row; missing
    leads stay zero.  Also returns a 12-element 0/1 indicator of which
    leads are present, e.g. for an 8-lead record (I, II, V1..V6):
    [1 1 0 0 0 0 1 1 1 1 1 1].
    """
    output = np.zeros((12, recording.shape[1]))
    twelve_leads = ('I', 'II', 'III', 'aVR', 'aVL', 'aVF',
                    'V1', 'V2', 'V3', 'V4', 'V5', 'V6')
    # Lead names are matched case-insensitively.
    twelve_leads = [k.lower() for k in twelve_leads]
    input_leads = [k.lower() for k in input_leads]
    output_leads = np.zeros((12,))
    for i, k in enumerate(input_leads):
        idx = twelve_leads.index(k)
        output[idx, :] = recording[i, :]
        output_leads[idx] = 1
    return output, output_leads


class lead_exctractor:
    """Select a specific lead configuration (or a random one) by zeroing
    the unused rows of a 12-lead recording.

    Twelve leads: I, II, III, aVR, aVL, aVF, V1, V2, V3, V4, V5, V6
    Eight leads:  I, II, V1, V2, V3, V4, V5, V6
    Six leads:    I, II, III, aVR, aVL, aVF
    Four leads:   I, II, III, V2
    Three leads:  I, II, V2
    Two leads:    I, II
    """
    # 0/1 masks over the canonical 12-lead ordering above.
    L2 = np.array([1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
    L3 = np.array([1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0])
    L4 = np.array([1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0])
    L6 = np.array([1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0])
    L8 = np.array([1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
    L12 = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

    @staticmethod
    def get(x, num_leads, lead_indicator):
        """Mask ``x`` (12 x T) down to ``num_leads`` leads.

        ``num_leads is None`` picks a random configuration per call
        (training-time augmentation).  Returns the masked signal and the
        correspondingly masked lead indicator.

        Raises ``Exception("invalid-leads-number")`` for unsupported counts.
        """
        if num_leads is None:
            # Random choice among all supported configurations.
            num_leads = random.choice([12, 8, 6, 4, 3, 2])
        masks = {
            12: lead_exctractor.L12,
            8: lead_exctractor.L8,
            6: lead_exctractor.L6,
            4: lead_exctractor.L4,
            3: lead_exctractor.L3,
            2: lead_exctractor.L2,
        }
        mask = masks.get(num_leads)
        if mask is None:
            raise Exception("invalid-leads-number")
        if num_leads != 12:
            # Zero out the rows of the leads that are not kept.
            x = x * mask.reshape(12, 1)
        return x, lead_indicator * mask


class dataset:
    """Challenge-2021 ECG dataset.

    Loads header metadata for every record up front, builds a 26-class
    multi-hot target per record, and in ``__getitem__`` applies optional
    mixup / cutmix augmentations followed by lead expansion, resampling to
    500 Hz, bandpass filtering, random cropping and z-score normalization.
    """

    # Scored SNOMED-CT codes (26 classes), in target-vector order.
    classes = ['164889003', '164890007', '6374002', '426627000', '733534002',
               '713427006', '270492004', '713426002', '39732003', '445118002',
               '164947007', '251146004', '111975006', '698252002', '426783006',
               '284470004', '10370003', '365413008', '427172004', '164917005',
               '47665007', '427393009', '426177001', '427084000', '164934002',
               '59931005']
    normal_class = '426783006'
    # Each sublist is a set of equivalent codes; the first entry is the
    # representative used in the target vector.
    equivalent_classes = [['713427006', '59118001'],
                          ['284470004', '63593006'],
                          ['427172004', '17338001'],
                          ['733534002', '164909002']]

    def __init__(self, header_files, Mixup=0, amount=0, cutMix=0,
                 Mixup_no_label_interpolate=0, progressive_switch=False):
        """Build the per-record metadata table.

        header_files: list of .hea paths; the matching .mat path is derived.
        Mixup / cutMix / Mixup_no_label_interpolate: per-item probabilities
            in [0, 1] of applying the corresponding augmentation.
        amount: number of records the random mixing partner is drawn from
            (should equal len(header_files) when any augmentation is on).
        progressive_switch: enable the epoch-dependent wave-mix schedule.
        """
        self.files = []
        self.sample = True       # random 8192-sample crop in __getitem__
        self.num_leads = None    # None -> random lead configuration per item
        for h in tqdm(header_files):
            tmp = dict()
            tmp['header'] = h
            tmp['record'] = h.replace('.hea', '.mat')
            hdr = load_header(h)
            tmp['nsamp'] = get_nsamp(hdr)
            tmp['leads'] = get_leads(hdr)
            tmp['age'] = get_age(hdr)
            tmp['sex'] = get_sex(hdr)
            tmp['dx'] = get_labels(hdr)
            tmp['fs'] = get_frequency(hdr)
            tmp['target'] = np.zeros((26,))
            tmp['dx'] = replace_equivalent_classes(tmp['dx'], dataset.equivalent_classes)
            for dx in tmp['dx']:
                # Only SNOMED codes in the scored classes contribute.
                if dx in dataset.classes:
                    idx = dataset.classes.index(dx)
                    tmp['target'][idx] = 1
            self.files.append(tmp)

        # Zero-phase 3rd-order Butterworth bandpass, 1-47 Hz, assuming a
        # 500 Hz sampling rate (cutoffs normalized by Nyquist = 250 Hz).
        self.b, self.a = signal.butter(3, [1 / 250, 47 / 250], 'bandpass')
        self.files = pd.DataFrame(self.files)
        self.Mixup = Mixup
        self.cutMix = cutMix
        # NOTE: the attribute keeps the historical misspelling ("interplate")
        # because external code may reference it by this name.
        self.Mixup_no_label_interplate = Mixup_no_label_interpolate
        self.amount = amount
        self.progressive = progressive_switch
        self.current_epoch = 0  # updated externally via set_epoch()

    def set_epoch(self, epoch):
        """Record the current training epoch (drives the progressive
        augmentation schedule in __getitem__)."""
        self.current_epoch = epoch

    def train_valid_split(self, test_size):
        """Multilabel-stratified split into (train, valid) datasets.

        Uses skmultilearn's iterative stratification on the multi-hot
        targets.  The train split keeps random lead selection and random
        cropping; the valid split is fixed at 12 leads with no random crop.
        """
        files = self.files['header'].to_numpy().reshape(-1, 1)
        targets = np.stack(self.files['target'].to_list(), axis=0)
        print("This is the targets:", targets)
        x_train, y_train, x_valid, y_valid = iterative_train_test_split(
            files, targets, test_size=test_size)
        train = dataset(header_files=x_train[:, 0].tolist())
        train.num_leads = None
        train.sample = True
        valid = dataset(header_files=x_valid[:, 0].tolist())
        valid.num_leads = 12
        valid.sample = False
        return train, valid

    def summary(self, output):
        """Per-class positive counts.

        output='pandas' -> pd.Series indexed by class code;
        output='numpy'  -> plain ndarray;
        anything else   -> None.
        """
        counts = np.stack(self.files['target'].to_list(), axis=0).sum(axis=0)
        if output == 'pandas':
            return pd.Series(counts, index=dataset.classes)
        if output == 'numpy':
            return counts

    def __len__(self):
        return len(self.files)

    def _draw_mix_partner(self):
        """Pick a random record (index in [0, self.amount)) to mix with.

        Returns (recording, target) for the chosen record.
        """
        j = random.randint(0, self.amount - 1)
        return load_recording(self.files.iloc[j]['record']), self.files.iloc[j]['target']

    def __getitem__(self, item):
        """Return (data, target).

        data: filtered, 500 Hz, z-scored 12 x T array (zero rows for
        absent/masked leads); target: 26-element multi-hot vector, possibly
        label-interpolated by the mixup/cutmix augmentations.
        """
        fs = self.files.iloc[item]['fs']
        target = self.files.iloc[item]['target']
        leads = self.files.iloc[item]['leads']
        data = load_recording(self.files.iloc[item]['record'])

        # --- probabilistic augmentations (training only) ---
        if random.random() < self.Mixup:
            mix_datum, mix_target = self._draw_mix_partner()
            data, alpha_value = mixup(data, mix_datum)
            target = alpha_value * target + (1 - alpha_value) * mix_target

        if random.random() < self.cutMix:
            mix_datum, mix_target = self._draw_mix_partner()
            data, alpha_value = cutmix_fix_length(data, mix_datum)
            target = alpha_value * target + (1 - alpha_value) * mix_target

        if random.random() < self.Mixup_no_label_interplate:
            # Mixup variant that deliberately leaves the labels untouched.
            mix_datum, _ = self._draw_mix_partner()
            data = same_shape_mixup(data, mix_datum)

        # Progressive wave-mix: mixup probability ramps 0.2, 0.4, 0.6, 0.8
        # over epochs 0-3, then stays at 0.8 from epoch 3 onward.
        progressive_index = 0
        if self.progressive:
            progressive_index = min(0.2 * (self.current_epoch + 1), 0.8)
        if random.random() < progressive_index:
            mix_datum, mix_target = self._draw_mix_partner()
            data, alpha_value = mixup(data, mix_datum)
            target = alpha_value * target + (1 - alpha_value) * mix_target

        # Expand to the 12-lead layout if the record has fewer channels.
        data, lead_indicator = expand_leads(data, input_leads=leads)
        data = np.nan_to_num(data)

        # Resample everything to 500 Hz.
        if fs == float(1000):
            data = signal.resample_poly(data, up=1, down=2, axis=-1)  # to 500 Hz
            fs = 500
        elif fs == float(500):
            pass
        else:
            data = signal.resample(data, int(data.shape[1] * 500 / fs), axis=1)
            fs = 500

        # Zero-phase Butterworth bandpass (1-47 Hz).
        data = signal.filtfilt(self.b, self.a, data)

        if self.sample:
            fs = int(fs)
            # Random 8192-sample crop when the signal is longer.
            # BUGFIX: the original drew from randint(L - 8193), which raised
            # ValueError for L in {8192, 8193} and could never pick the last
            # two valid offsets; now all offsets in [0, L - 8192] are possible.
            if data.shape[-1] > 8192:
                start = np.random.randint(0, data.shape[-1] - 8192 + 1)
                data = data[:, start:start + 8192]

        # Per-lead z-score normalization.  All-zero (padded) leads yield 0/0
        # NaNs here; the warnings are suppressed and NaNs zeroed just after.
        mu = np.nanmean(data, axis=-1, keepdims=True)
        std = np.nanstd(data, axis=-1, keepdims=True)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            data = (data - mu) / std
        data = np.nan_to_num(data)

        # Keep a fixed (self.num_leads) or random subset of leads.
        data, lead_indicator = lead_exctractor.get(data, self.num_leads, lead_indicator)
        return data, target