@@ -144,16 +144,6 @@ def set_uncorrected_matrix(self, pMatrix):
144
144
def load_cool_only_init(self, pMatrixFile):
    """Open a cooler file and keep the handle on this object.

    Only the ``cooler.Cooler`` object is created here; no matrix or bin
    data is read by this method.

    :param pMatrixFile: value passed unchanged to ``cooler.Cooler``
        (presumably the path/URI of the cool file -- confirm with callers).
    :return: None. The handle is stored in ``self.cooler_file``.
    """
    self.cooler_file = cooler.Cooler(pMatrixFile)
146
146
147
- # def load_cool_bins(self, pChr=None):
148
- # if pChr:
149
- # return self.cooler_file.bins().fetch(pChr)
150
- # else:
151
- # if 'weight' in self.cooler_file.bins():
152
- # cut_intervals_data_frame = self.cooler_file.bins()[['chrom', 'start', 'end', 'weight']][:]
153
- # else:
154
- # cut_intervals_data_frame = self.cooler_file.bins()[['chrom', 'start', 'end']][:]
155
- # self.cut_intervals = [tuple(x) for x in cut_intervals_data_frame.values]
156
-
157
147
def load_cool_matrix(self, pChr):
    """Fetch the unbalanced matrix of one region in pixel form.

    ``self.cooler_file`` must already be set, e.g. by
    ``load_cool_only_init``.

    :param pChr: region selector handed unchanged to ``fetch``
        (presumably a chromosome name -- confirm with callers).
    :return: whatever ``fetch`` returns for a selector created with
        ``balance=False`` and ``as_pixels=True``.
    """
    # balance=False / as_pixels=True: request the raw, uncorrected values
    # as a pixel table instead of a dense or sparse matrix.
    pixel_selector = self.cooler_file.matrix(balance=False, as_pixels=True)
    return pixel_selector.fetch(pChr)
159
149
@@ -177,40 +167,30 @@ def load_cool(self, pMatrixFile, pChrnameList=None, pMatrixOnly=None, pIntraChro
177
167
if pChrnameList is None :
178
168
# some bug in the csr function of cooler or numpy doubles the data
179
169
matrix_data_frame = self .cooler_file .matrix (balance = False , as_pixels = True )[:]
180
- # log.info("matrix data frame LOAD: {}".format(matrix_data_frame.values))
181
- # log.info("matrix data frame data LOAD: {}".format(matrix_data_frame.values[:, 2].flatten()))
182
- # log.info("matrix data frame row LOAD: {}".format(matrix_data_frame.values[:, 1].flatten()))
183
- # log.info("matrix data frame col LOAD: {}".format(matrix_data_frame.values[:, 0].flatten()))
184
-
170
+ log .info ("matrix data frame LOAD: {}" .format (matrix_data_frame .values ))
185
171
length = len (self .cooler_file .bins ()[['chrom' ]][:].index )
186
172
187
173
matrix = csr_matrix ((matrix_data_frame .values [:, 2 ].flatten (), (matrix_data_frame .values [:, 0 ].flatten (), matrix_data_frame .values [:, 1 ].flatten ())), shape = (length , length ))
188
- # log.info("matrix data csr: {}".format(matrix) )
189
174
else :
190
175
if len (pChrnameList ) == 1 :
191
-
192
176
try :
193
177
matrix = self .cooler_file .matrix (balance = False , sparse = True ).fetch (pChrnameList [0 ]).tocsr ()
194
- # matrix_data_frame = self.cooler_file.matrix(balance=False, as_pixels=True).fetch(pChrnameList[0])
195
- # length = len(self.cooler_file.bins().fetch(pChrnameList[0])[['chrom']].index)
196
- # matrix = csr_matrix((matrix_data_frame.values[:, 2].flatten(), (matrix_data_frame.values[:, 0].flatten(), matrix_data_frame.values[:, 1].flatten())), shape=(length, length))
197
-
198
178
except ValueError :
199
179
exit ("Wrong chromosome format. Please check UCSC / ensembl notation." )
200
-
201
180
else :
202
181
exit ("Operation to load more as one region is not supported." )
203
182
204
183
cut_intervals_data_frame = None
205
184
correction_factors_data_frame = None
185
+
206
186
if pChrnameList is not None :
207
187
if len (pChrnameList ) == 1 :
208
188
cut_intervals_data_frame = self .cooler_file .bins ().fetch (pChrnameList [0 ])
189
+
209
190
if 'weight' in cut_intervals_data_frame :
210
191
correction_factors_data_frame = cut_intervals_data_frame ['weight' ]
211
192
else :
212
193
exit ("Operation to load more than one chr from bins is not supported." )
213
-
214
194
else :
215
195
if 'weight' in self .cooler_file .bins ():
216
196
correction_factors_data_frame = self .cooler_file .bins ()[['weight' ]][:]
@@ -224,16 +204,12 @@ def load_cool(self, pMatrixFile, pChrnameList=None, pMatrixOnly=None, pIntraChro
224
204
self .set_uncorrected_matrix (deepcopy (matrix ))
225
205
matrix .eliminate_zeros ()
226
206
matrix .data = matrix .data .astype (float )
227
- log .info ("Applying correction factors on matrix..." )
228
207
229
208
instances , features = matrix .nonzero ()
230
- log .info ('len matrix.data: {}' .format (len (matrix .data )))
231
- log .info ('len instance: {}' .format (len (instances )))
232
- log .info ('len features: {}' .format (len (features )))
233
- log .info ('len correctionfactors.values: {}' .format (len (correction_factors_data_frame .values )))
234
-
235
- for i in range (len (matrix .data )):
236
- matrix .data [i ] /= correction_factors_data_frame .values [instances [i ]] * correction_factors_data_frame .values [features [i ]]
209
+ instances_factors = correction_factors_data_frame .values [instances ].flatten ()
210
+ features_factors = correction_factors_data_frame .values [features ].flatten ()
211
+ instances_factors *= features_factors
212
+ matrix .data /= instances_factors
237
213
correction_factors = correction_factors_data_frame .values
238
214
239
215
cut_intervals = []
@@ -258,14 +234,9 @@ def load_cool(self, pMatrixFile, pChrnameList=None, pMatrixOnly=None, pIntraChro
258
234
nan_bins = None
259
235
260
236
distance_counts = None
261
- # log.info("matrix data csr BEFORE FILL_LOWER: {}".format(matrix) )
262
237
263
238
matrix = hiCMatrix .fillLowerTriangle (matrix )
264
239
265
- # log.info("matrix data csr AFTER FILL_LOWER: {}".format(matrix) )
266
-
267
- # log.info('cut_intervals: {}'.format(cut_intervals))
268
-
269
240
return matrix , cut_intervals , nan_bins , distance_counts , correction_factors
270
241
271
242
@staticmethod
@@ -317,7 +288,11 @@ def load_h5(matrix_filename):
317
288
distance_counts = f .root .correction_factors .read ()
318
289
else :
319
290
distance_counts = None
320
- log .info ("H5:::matrix.data[:10]: {}" .format (matrix .data [:10 ]))
291
+ # log.info("H5:::matrix.data[:10]: {}".format(matrix.data[:10]))
292
+ # log.info("H5:::matrix.data[:10]: {}".format(matrix.data[:10]))
293
+ # log.info("H5:::matrix.data[100:110]: {}".format(matrix.data[100:110]))
294
+ # log.info("H5:::matrix.data[500:510]: {}".format(matrix.data[500:510]))
295
+ # log.info("H5:::matrix.data[-10:-1]: {}".format(matrix.data[-10:-1]))
321
296
return matrix , cut_intervals , nan_bins , distance_counts , correction_factors
322
297
323
298
@staticmethod
@@ -1368,19 +1343,18 @@ def save_cooler(self, pFileName, pDataFrameBins=None, pDataFrameMatrix=None, pSy
1368
1343
pDataFrameBins ['end' ] = pDataFrameBins ['end' ].astype (np .int64 )
1369
1344
bins_data_frame = pDataFrameBins
1370
1345
else :
1371
- log .info ("self.cut_intervals[:10]: {}" .format (self .cut_intervals [:10 ]))
1372
- # log.info("np.array(self.cut_intervals)[:, :3][:10]: {}", np.array(self.cut_intervals)[:, :3][:10])
1373
1346
cut_intervals_ = []
1347
+ # extra_list = []
1374
1348
for value in self .cut_intervals :
1375
1349
cut_intervals_ .append (tuple ((value [0 ], value [1 ], value [2 ])))
1350
+ # extra_list.append(value[3])
1376
1351
bins_data_frame = pd .DataFrame (cut_intervals_ , columns = ['chrom' , 'start' , 'end' ])
1377
- # log.info("bins_data_frame: {}".format(bins_data_frame))
1378
1352
# append correction factors if they exist
1379
1353
if self .correction_factors is not None :
1380
1354
log .debug ("Correction factors present! self.correction_factors is not None" )
1381
1355
1382
1356
bins_data_frame = bins_data_frame .assign (weight = self .correction_factors )
1383
- # log.info(" bins_data_frame II : {}".format(bins_data_frame) )
1357
+ # bins_data_frame = bins_data_frame.assign(extra=extra_list )
1384
1358
1385
1359
if pDataFrameMatrix :
1386
1360
if pDataFrameMatrix ['bin1_id' ].dtypes != 'int64' :
@@ -1400,10 +1374,18 @@ def save_cooler(self, pFileName, pDataFrameBins=None, pDataFrameMatrix=None, pSy
1400
1374
log .info ("Reverting correction factors on matrix..." )
1401
1375
1402
1376
instances , features = matrix .nonzero ()
1403
- for i in range (len (matrix .data )):
1404
- matrix .data [i ] *= self .correction_factors [instances [i ]] * self .correction_factors [features [i ]]
1405
1377
1378
+ instances_factors = self .correction_factors [instances ].flatten ()
1379
+ features_factors = self .correction_factors [features ].flatten ()
1380
+
1381
+ instances_factors *= features_factors
1382
+ matrix .data *= instances_factors
1383
+ instances_factors = None
1384
+ features_factors = None
1385
+
1386
+ matrix .data = np .rint (matrix .data )
1406
1387
matrix .data = matrix .data .astype (int )
1388
+
1407
1389
data = matrix .data .tolist ()
1408
1390
1409
1391
elif self .uncorrected_matrix is not None :
@@ -1417,11 +1399,9 @@ def save_cooler(self, pFileName, pDataFrameBins=None, pDataFrameMatrix=None, pSy
1417
1399
1418
1400
cooler ._writer .COUNT_DTYPE = matrix .dtype
1419
1401
1420
- log .info ("Data in save uncorrected II: {}" .format (data [:10 ]))
1421
-
1422
1402
matrix_tuple_list = zip (instances .tolist (), features .tolist (), data )
1423
1403
matrix_data_frame = pd .DataFrame (matrix_tuple_list , columns = ['bin1_id' , 'bin2_id' , 'count' ])
1424
- log . info ( "matrix data frame SAVE: {}" . format ( matrix_data_frame . values [: 10 ]))
1404
+
1425
1405
cooler .io .create (cool_uri = pFileName ,
1426
1406
bins = bins_data_frame ,
1427
1407
pixels = matrix_data_frame )
0 commit comments