Preparing a data set for Deep Learning from zipped ABR raw data files
This notebook covers the pre-processing of the Auditory Brainstem Response (ABR) raw data files provided by Ingham et al. to create a data set for Deep Learning models.
The unprocessed ABR data files are available at Dryad.
Since the ABR raw data are distributed as zip archives, the archives have to be unpacked and the extracted raw data files parsed so that the time series corresponding to the ABR audiograms can be saved in a single csv file.
The final data set contains the ABR time series, an individual mouse identifier, stimulus frequency, stimulus sound pressure level (SPL) and a manually determined hearing threshold. For each mouse there are different time series corresponding to six different sound stimuli: broadband click, 6, 12, 18, 24, and 30 kHz, each of which was measured for a range of sound pressure levels. The exact range of sound levels can vary between the different mice and stimuli.
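For orientation, the following is a minimal sketch of how the final curves file written at the end of this notebook (abr_curves.csv) could be loaded and inspected; it assumes the notebook has already been run and only uses the column layout described above:

```python
import pandas as pd

# Load the final ABR curves data set produced at the end of this notebook.
abr = pd.read_csv('abr_curves.csv')

# Metadata columns plus the time-step columns t1 ... t1953.
print(abr[['mouse_id', 'frequency', 'sound_level', 'threshold']].head())
n_steps = len([c for c in abr.columns if c.startswith('t') and c[1:].isdigit()])
print('time-step columns per curve:', n_steps)
```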
The following is done:

- The zip archives are unpacked.
- The extracted ABR raw data files are parsed and collected in one csv file per archive.
- The csv files are merged into a data set of time series. Each time series corresponds to an ABR audiogram measured for a mouse at a specific frequency and sound level.
- The mouse phenotyping data are available in Excel format. The individual data sheets are combined into one mouse phenotyping data set, maintaining the mouse pipeline and the cohort type mapping. In addition, the hearing thresholds are added to the ABR audiogram data set.
- The data sets are curated (a compact sketch of these checks follows this list):
  - there is a single curve per mouse, stimulus frequency and sound level,
  - each sound level is included in the list of potential sound pressure levels,
  - for each mouse for which an ABR audiogram has been measured, mouse phenotyping data are also provided.
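The three curation criteria can be phrased as assertions. This is only a sketch; it refers to the data frames built further below in the notebook (df2: curated ABR curves, mouse_data2: matching phenotyping data) and to the list of potential sound levels defined at the top:

```python
# Sketch of the curation checks on the final data frames (built later in this notebook).
assert not df2.duplicated(['mouse_id', 'frequency', 'sound_level']).any()  # one curve per combination
assert df2['sound_level'].isin(sound_levels).all()                         # only potential sound levels
assert df2['mouse_id'].isin(mouse_data2['mouse_id']).all()                 # phenotyping data available
```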
Code Snippets
```python
%reload_ext autoreload
%autoreload 2
%matplotlib inline
```
```python
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
```
```python
import os
import glob
import csv
import time
import sys

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from zipfile import ZipFile
```
```python
"""Define the path to the zip-archives"""
path2data = '...'
```
```python
"""
Define potential frequencies measured in Hz, with the exception of 100,
which stands for a broadband frequency stimulus (click)
"""
freqs = [100, 6000, 12000, 18000, 24000, 30000]
print(*['potential stimulus frequencies: ' + str(x) if x==100 else str(x)+'Hz' for x in freqs], sep=", ")
```
```python
"""Define potential sound pressure levels measured in dB"""
sound_levels = [x for x in range(0, 100, 5)]
print(*['potential sound pressure levels [dB]: ' + str(x) if x==0 else str(x) for x in sound_levels], sep=", ")
```
```python
"""Define the columns of the final data set"""
columns = []
for col in ['mouse_id', 'frequency', 'sound_level']:
    columns.append(col)

i = 1
time_steps = 1953
while i <= time_steps:
    columns.append('t' + str(i))
    i += 1

"""Required to be able to subsequently exclude test measurements"""
columns.append('test')
```
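As a side note, the same column list can also be built in a single expression; this is an equivalent, more compact variant of the cell above:

```python
# Compact construction of the same column layout: metadata, t1 ... t1953, test flag.
time_steps = 1953
columns = (['mouse_id', 'frequency', 'sound_level']
           + ['t%d' % i for i in range(1, time_steps + 1)]
           + ['test'])
```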
```python
def plot_curves(_df, _mouse_id, _freq, _sl=None, _threshold=None):
    """
    Plots ABR curves for a given mouse identifier and frequency.

    Parameters
    ----------
    _df : pandas-data-frame
        A data frame that contains ABR time series in each row.
    _mouse_id : string
        A given mouse identifier.
    _freq : string
        A given stimulus frequency.
    _sl : string, default 'None'
        A given sound pressure level.
    _threshold : string, default 'None'
        A manually determined hearing threshold for a given sound stimulus.
    """
    data_range = range(1, 1953)
    data_cols = ['t' + str(i) for i in data_range]

    yticks = _df.loc[(_df.mouse_id==_mouse_id) & (_df.frequency==_freq), 'sound_level'].unique()

    plt.rcParams.update({'font.size': 20})
    plt.figure(figsize=(30, 24), dpi=200, facecolor='w', edgecolor='k')
    plt.xlabel('Time steps [overall 10ms]')
    plt.ylabel('Corresponding sound level [dB]')
    plt.title('Mouse ID: ' + str(_mouse_id) + ' - Frequency: ' + str(_freq))
    plt.yticks(yticks, fontsize='small')
    plt.ylim((min(yticks) - 5, max(yticks) + 15))

    if _sl:
        _df1 = _df[(_df['sound_level']==_sl) & (_df['mouse_id']==_mouse_id) & (_df['frequency']==_freq)][data_cols]
        idx = 0
        while idx < len(_df1.index):
            plt.plot(data_range, _sl + 2.5*_df1.iloc[idx], color='#333F50', linewidth=2.5)
            idx += 1
    else:
        for soundlevel in _df.loc[(_df.mouse_id==_mouse_id) & (_df.frequency==_freq), 'sound_level']:
            plt.plot(data_range,
                     soundlevel + 2.5*_df[(_df['sound_level']==soundlevel) & (_df['mouse_id']==_mouse_id) & (_df['frequency']==_freq)][data_cols].iloc[0],
                     color='#333F50', linewidth=2.5)

    if _threshold is not None:
        plt.hlines(_threshold, -1, 2000, colors=None, linestyles='dashed', label='threshold', linewidth=5.0)
```
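A possible call could look like the following; the mouse identifier is a placeholder, and df2 and the threshold value refer to the curated data set built later in this notebook:

```python
# Hypothetical usage: plot all 18 kHz curves of one mouse together with its
# manually assigned hearing threshold (the mouse ID below is a placeholder).
plot_curves(df2, '1234567890 ABR', 18000, _threshold=45)
plt.show()
```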
```python
def get_duplicates(_df, _columns):
    """
    Identifies duplicates by columns in a given dataset.

    Parameters
    ----------
    _df : pandas-data-frame
        A data frame that contains ABR time series in each row.
    _columns : list
        Columns that may contain duplicates.

    Returns
    -------
    A pandas-data-frame containing the duplicated rows from the input data frame.
    """
    return pd.concat(g for _, g in _df.groupby(_columns) if len(g) > 1)
```
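For example, curves that share a mouse identifier, stimulus frequency and sound level could be listed like this (df refers to the merged raw data set built below):

```python
# List rows that share mouse_id, frequency and sound_level with at least one other row.
# Note: pd.concat raises a ValueError if the groupby yields no duplicate groups at all.
dups = get_duplicates(df, ['mouse_id', 'frequency', 'sound_level'])
display(dups.head())
```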
```python
def parse_file_content(_file_content, _df, _sound_levels):
    """
    Parses the contents of an ABR raw data file.

    Parameters
    ----------
    _file_content : bytes
        The bytes of a file in a zip-archive.
    _df : pandas-data-frame
        An empty data frame with specific columns to store the results.
    _sound_levels : list
        List of potential sound pressure levels.

    Returns
    -------
    _df : pandas-data-frame
        The input data frame populated with the contents of the file.
    """
    delimiter = '='
    idx = len(_df.index) - 1

    for item in str(_file_content, 'utf-8').split('\r\n'):
        if delimiter in item:
            row = item.split(delimiter)
            if row[0]:
                row[0] = row[0].strip()
                if row[0] == 'TraceName':
                    _continue = ('ABR' in row[1])
                    if _continue:
                        split = row[1].split(',')
                        mouse_id = split[0].strip()
                        freq = int(split[1].strip())
                        sl = int(split[2].strip())
                        _continue = sl in _sound_levels
                elif row[0] == 'TraceInfo':
                    if _continue:
                        steps = row[1].split(',')[2]
                        _continue = int(steps) == time_steps
                        if _continue:
                            idx += 1
                            j = 1
                elif 'TraceData' in row[0]:
                    if _continue:
                        _df.at[idx, 'mouse_id'] = mouse_id
                        _df.at[idx, 'frequency'] = freq
                        _df.at[idx, 'sound_level'] = sl
                        _df.at[idx, 'test'] = False
                        for elem in row[1].split(','):
                            try:
                                _df.at[idx, 't' + str(j)] = float(elem.strip())
                                j += 1
                            except ValueError:
                                print("error on", elem, "!")
                elif 'Electrode Amplifier' in row[0]:
                    if _continue:
                        _df.at[idx, 'test'] = True
    return _df
```
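A single raw file can also be parsed directly from an archive in memory, without going through the csv step; the archive name in this sketch is an assumption:

```python
# Hypothetical sketch: parse the first raw data file of one archive in memory.
with ZipFile(os.path.join(path2data, 'ABR_archive_1.zip'), 'r') as zf:
    raw_bytes = zf.read(zf.namelist()[0])
    single_df = parse_file_content(raw_bytes, pd.DataFrame(columns=columns), sound_levels)
display(single_df.head())
```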
```python
def parse_zip_file2csv(_file, _columns, _sound_levels, _path2file='tmp/'):
    """
    Extracts a given zip archive, parses the contents of the extracted raw data
    files and saves the results in a single csv file.

    Parameters
    ----------
    _file : string
        The name of the ABR raw data files zip archive.
    _columns : list
        The columns of the csv file containing raw data from the archive.
    _sound_levels : list
        List of potential sound pressure levels.
    _path2file : string, default 'tmp/'
        Path to csv file.
    """
    with ZipFile(_file, 'r') as zipFile:
        fileNames = zipFile.namelist()

        fname = os.path.splitext(os.path.basename(_file))[0] + '.csv'
        fname = _path2file + fname

        for idx, fileName in enumerate(fileNames, 1):
            start_time = time.time()

            extractedFile = zipFile.read(fileName)
            df = parse_file_content(extractedFile, pd.DataFrame(columns=_columns), _sound_levels)
            df = df.dropna().reset_index()

            with open(fname, 'a') as f:
                df.to_csv(f, mode='a', header=f.tell()==0, index=False)
            del df

            elapsed_time = time.time() - start_time
            print('%d. file: %s (%s)' % (idx, fileName, time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))
```
```python
def cleanup_dataset(_df):
    '''
    Cleans up the ABR raw data: excludes test traces, drops duplicates and
    calculates the mean value in case of multiple time series for the same
    mouse, frequency and sound level.
    '''
    # exclude test traces
    _df1 = _df[_df.test == False]

    # drop duplicates
    _df2 = _df1.drop_duplicates()

    # keep mean in case of multiple time series for same mouse, frequency and sound level
    _df3 = _df2.groupby(['mouse_id', 'frequency', 'sound_level']).mean().reset_index()

    return _df3
```
```python
zip_files = glob.glob(path2data + '/*.zip')
for idx, zip_file in enumerate(sorted(zip_files), 1):
    print('%d. %s' % (idx, os.path.basename(zip_file)))
```
```python
start_time = time.time()
print('\nStart time: %s' % time.strftime("%H:%M:%S", time.gmtime(start_time)))

for idx, zip_file in enumerate(zip_files, start=1):
    basename = os.path.basename(zip_file)
    print('\n%d. zip archive: %s\n' % (idx, basename))
    # the per-archive csv files are written to 'tmp/' by default (see parse_zip_file2csv),
    # so check there to avoid appending duplicate rows on a re-run
    if not os.path.exists('tmp/' + os.path.splitext(basename)[0] + '.csv'):
        try:
            parse_zip_file2csv(zip_file, columns, sound_levels)
        except NotImplementedError:
            print('%s: %s' % ('NotImplementedError', basename))
        except NameError:
            print('%s: %s' % ('NameError', basename))

elapsed_time = time.time() - start_time
print('\nElapsed time: %s' % time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
```
```python
"""Get list of csv files"""
csv_files = glob.glob('tmp/*.csv')
for idx, csv_file in enumerate(sorted(csv_files), 1):
    print('%d. %s' % (idx, os.path.basename(csv_file)))
```
```python
"""Process data from the csv files to create an ABR data set"""
df = None

for idx, csv_file in enumerate(csv_files, 1):
    print('%d. %s' % (idx, os.path.basename(csv_file)))
    try:
        _df1 = pd.read_csv(csv_file)
        _df1 = _df1[_df1.frequency.isin(freqs) & _df1.sound_level.isin(sound_levels)]
        _df1.drop(columns=['index'], inplace=True)
        _df2 = cleanup_dataset(_df1)
        print(' stimulus frequencies: %s' % set(_df2.frequency))
        print(' sound levels: %s' % set(_df2.sound_level))
        print(' number of mouse ids: %d' % _df2.mouse_id.nunique())
        if df is None:
            df = _df2.copy()
        else:
            df = pd.concat([df, _df2.copy()], ignore_index=True)
        del _df2
        del _df1
    except Exception:
        print('Error :-(')

df.head()
```
```python
"""Check if any test curves are in the data set"""
if True not in df.test.unique():
    print('There are no test curves in the data set.')
else:
    print('The data set also contains test curves.')
```
```python
"""Get number of mice in the data set"""
print('Mice: %d' % df.mouse_id.nunique())
```
```python
"""Define path to file containing the mouse phenotyping data set"""
file = os.path.join(path2data, 'ABR_RESOURCE_Mouse ID.xlsx')

"""Read first Excel spreadsheet"""
mouse_data = pd.read_excel(file, sheet_name='Pipeline2 Controls', engine='openpyxl', usecols='B:R', parse_dates=True)

"""Delete empty rows"""
mouse_data = mouse_data.dropna(axis=0, how='all').reset_index()

"""Delete the index column"""
mouse_data.drop(columns=['index'], inplace=True)

"""Fill in the cohort type column. Possible values: 'con' for controls, 'mut' for mutants"""
mouse_data['cohort_type'] = 'con'

"""Read remaining Excel spreadsheets"""
for sheet in ['Pipeline2 Mutants', 'MouseGP Controls', 'MouseGP Mutants', 'MGP Select Controls', 'MGP Select Mutants']:
    _mouse_data = pd.read_excel(file, sheet_name=sheet, engine='openpyxl', usecols='B:R', parse_dates=True)
    _mouse_data = _mouse_data.dropna(axis=0, how='all').reset_index()
    _mouse_data.drop(columns=['index'], inplace=True)
    _mouse_data.rename(columns={'Prefix': 'Colony Prefix', 'Barcode': 'Mouse Barcode',
                                'Name': 'Mouse Name', 'Age': 'Age at Test'}, inplace=True)
    if 'Mutants' in sheet:
        _mouse_data['cohort_type'] = 'mut'
    else:
        _mouse_data['cohort_type'] = 'con'
    # DataFrame.append has been removed in pandas 2.0; pd.concat is the equivalent
    mouse_data = pd.concat([mouse_data, _mouse_data], ignore_index=True)

display(mouse_data.head(5))
```
```python
"""Delete rows that do not have a valid mouse barcode"""
mouse_data = mouse_data[mouse_data['Mouse Barcode'] != 'Mouse Barcode'].reset_index(drop=True)

"""Define new column for mouse IDs"""
mouse_data['mouse_id'] = mouse_data['Mouse Barcode'] + ' ABR'
```
```python
"""Check if the number of mice in the data set changed"""
print('Mice: %d' % mouse_data.mouse_id.nunique())
```
```python
"""Always keep the first of duplicated rows"""
mouse_data = mouse_data.drop_duplicates(['mouse_id', 'Click Threshold', '6kHz Threshold', '12kHz Threshold',
                                         '18kHz Threshold', '24kHz Threshold', '30kHz Threshold'])

"""Check if duplicated rows still exist"""
duplicated = mouse_data[mouse_data.duplicated(['mouse_id', 'Click Threshold', '6kHz Threshold', '12kHz Threshold',
                                               '18kHz Threshold', '24kHz Threshold', '30kHz Threshold'])]
if duplicated.empty:
    print('There are no duplicated rows.')
else:
    display(duplicated)
```
```python
"""Check if the number of mice in the data set changed"""
print('Mice: %d' % mouse_data.mouse_id.nunique())
```
```python
"""Check for possible values for frequency-specific hearing thresholds"""
print('Existing hearing thresholds')
for col in ['Click Threshold', '6kHz Threshold', '12kHz Threshold', '18kHz Threshold', '24kHz Threshold', '30kHz Threshold']:
    print(' * %s [dB]: %s' % (col.split(' ')[0], sorted(list(mouse_data[col].unique()))))
```
```python
"""Make sure that mouse phenotyping data are available for all mice with measured ABR curves"""
df = df[df.mouse_id.isin(mouse_data.mouse_id.unique())].reset_index(drop=True)
print('Mice with measured ABR curves: %d' % df.mouse_id.nunique())
```
```python
"""Make sure that ABR curves have been measured for all mice with phenotyping data"""
mouse_data = mouse_data[mouse_data.mouse_id.isin(df.mouse_id.unique())].reset_index(drop=True)
print('Mice with phenotyping data: %d' % mouse_data.mouse_id.nunique())
```
```python
"""Map the hearing threshold columns to corresponding stimulus frequencies"""
col_mapping = {100: 'Click Threshold',
               6000: '6kHz Threshold',
               12000: '12kHz Threshold',
               18000: '18kHz Threshold',
               24000: '24kHz Threshold',
               30000: '30kHz Threshold'}
```
```python
"""Add a hearing threshold column to ABR data set"""
df1 = None

for freq in col_mapping:
    print('stimulus frequency: %d%s' % (freq, '' if freq == 100 else 'Hz'))
    col = col_mapping[freq]
    df_freq = df.loc[df.frequency == freq]
    df_freq = pd.merge(left=df_freq, right=mouse_data[['mouse_id', col]], on='mouse_id', how='left')
    df_freq.rename(columns={col: 'threshold'}, inplace=True)
    if df1 is None:
        print(' create df1 ...')
        df1 = df_freq.copy()
    else:
        print(' concat results ...')
        df1 = pd.concat([df1, df_freq])
    del df_freq

display(df1.head(5))
del df
```
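As a design note, the per-frequency merge above could also be expressed as a single merge after reshaping the six threshold columns into long format. This is only a sketch of the equivalent operation on the same inputs (df and mouse_data as used in the cell above):

```python
# Reshape the six threshold columns into (mouse_id, frequency, threshold) long format ...
thresholds_long = mouse_data.melt(id_vars='mouse_id',
                                  value_vars=list(col_mapping.values()),
                                  var_name='threshold_col', value_name='threshold')
thresholds_long['frequency'] = thresholds_long['threshold_col'].map(
    {col: freq for freq, col in col_mapping.items()})

# ... and attach the thresholds to the ABR curves with one merge.
df1_alt = df.merge(thresholds_long[['mouse_id', 'frequency', 'threshold']],
                   on=['mouse_id', 'frequency'], how='left')
```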
```python
"""Always keep the first of duplicated rows"""
df1 = df1.drop_duplicates()

"""Check if duplicated rows still exist"""
duplicated = df1[df1.duplicated()]
if duplicated.empty:
    print('There are no duplicated rows.')
else:
    display(duplicated)
```
```python
"""Check if number of mice in the data set changed"""
print('Mice: %d' % df1.mouse_id.nunique())
```
```python
"""List existing stimulus frequencies"""
print('Existing stimulus frequencies: %s' % df1.frequency.unique())

"""List existing sound levels"""
print('Existing sound levels: %s' % df1.sound_level.unique())
```
```python
"""Replace NaN threshold values"""
# AUL is used as a sentinel threshold for curves without a manually assigned
# threshold (presumably no response up to the highest tested sound level)
AUL = 999
df1['threshold'] = df1['threshold'].fillna(AUL)
df1['threshold'] = df1['threshold'].astype('int32')
print('Existing hearing thresholds [dB]: %s' % sorted(df1['threshold'].unique()))
```
```python
"""Keep only rows whose threshold is in the list of potential sound levels (or equals AUL)"""
df1 = df1[df1.threshold.isin(sound_levels + [AUL])]
print('Existing hearing thresholds [dB]: %s' % sorted(df1['threshold'].unique()))
```
```python
"""Check if number of mice in the data set changed"""
print('Mice: %d' % df1.mouse_id.nunique())
```
```python
"""Check for mouse IDs with multiple hearing thresholds for a given stimulus frequency"""
mouse_ids = df1[df1.columns.drop('threshold')][df1[df1.columns.drop('threshold')].duplicated()].mouse_id.unique()
print('Mouse IDs: %s' % mouse_ids)

"""Exclude these mouse IDs from the data set"""
if mouse_ids.any():
    display(mouse_data[mouse_data.mouse_id.isin(mouse_ids)])

df2 = df1[~df1.mouse_id.isin(mouse_ids)]
del df1
```
```python
"""Make sure the mouse phenotyping data set contains only mice having ABR curves measured for valid sound levels"""
mouse_data2 = mouse_data[mouse_data.mouse_id.isin(df2.mouse_id.unique())].reset_index(drop=True)
```
```python
"""Both data sets should have the same number of mice"""
print('%d mice with ABR curves = %d mice with phenotyping data : %s'
      % (df2.mouse_id.nunique(), mouse_data2.mouse_id.nunique(), (df2.mouse_id.nunique()==mouse_data2.mouse_id.nunique())))
```
```python
"""Save ABR curves data set to csv file"""
df2[df2.columns.drop('test')].to_csv('abr_curves.csv', index=False)
display(df2.head(5))
```
```python
"""Save mouse phenotyping data set to csv file"""
mouse_data2.to_csv('mouse_data.csv', index=False)
display(mouse_data2.head(5))
```
```python
import matplotlib.gridspec as gridspec
import matplotlib.ticker as ticker
import random
```
```python
"""Create random list of mouse IDs"""
mice = random.sample(list(df2.mouse_id.unique()), 100)
```
```python
plt.rcParams['figure.figsize'] = [10, 8]

"""Define columns with time series data"""
data_cols = ['t%d' % i for i in range(1, 1951)]
data_range = range(1, 1951)

for mouse in mice[:1]:
    fig = plt.figure(constrained_layout=True, figsize=(80, 64))
    sound_levels = df2['sound_level'].unique()
    df = df2[df2.mouse_id == mouse]

    cols = 2
    rows = 3  # int(len(df.frequency.unique()) / cols)
    col = 0
    row = 0

    spec = gridspec.GridSpec(ncols=cols, nrows=rows, figure=fig)
    f_ax = {}

    for idx, freq in enumerate(df.frequency.unique()):
        f_ax[idx] = fig.add_subplot(spec[row, col])
        if freq == 100:
            f_ax[idx].set_title('Click')
        else:
            f_ax[idx].set_title('%dkHz' % (freq/1000))
        f_ax[idx].set_yticks(sound_levels)

        """Get hearing threshold for given stimulus frequency"""
        human_thr = None
        thr = df[df['frequency'] == freq]['threshold'].unique()
        if len(thr) > 0:
            human_thr = thr[0]

        """Plot the curves"""
        plt.rcParams.update({'font.size': 20})
        f_ax[idx].set_xlabel('Timesteps [overall 10ms]')
        f_ax[idx].set_ylabel('Sound level [dB]')
        if freq == 100:
            f_ax[idx].set_title('Click - manually assigned threshold: %sdB' % human_thr)
        else:
            f_ax[idx].set_title('%dkHz - manually assigned threshold: %sdB' % (freq/1000, human_thr))

        for sound_level in df.loc[df['frequency'] == freq, 'sound_level']:
            f_ax[idx].plot(data_range,
                           sound_level + 2.5 * df[(df['sound_level'] == sound_level) & (df['frequency'] == freq)][data_cols].iloc[0],
                           linewidth=2.5)

        if human_thr and human_thr != 999:
            f_ax[idx].hlines(y=human_thr, xmin=data_range[0], xmax=data_range[-1], linewidth=2.5, linestyles='dotted')

        col += 1
        if col == cols:
            row += 1
            col = 0

        labels = [sl for sl in sound_levels]
        f_ax[idx].yaxis.set_major_formatter(ticker.FixedFormatter(labels))

    fig.suptitle('Mouse ID: %s' % mouse, fontsize=24)

    # _file = 'curves/' + mouse.replace(' ', '_')
    # plt.savefig(_file)
```