
Commit 86e5965

Merge pull request #213 from deeptools/cooler_improvements
Cooler improvements
2 parents 5562fea + f20015b commit 86e5965

File tree

2 files changed: +78 additions, -46 deletions


hicexplorer/HiCMatrix.py

Lines changed: 26 additions & 46 deletions
@@ -144,16 +144,6 @@ def set_uncorrected_matrix(self, pMatrix):
     def load_cool_only_init(self, pMatrixFile):
         self.cooler_file = cooler.Cooler(pMatrixFile)
 
-    # def load_cool_bins(self, pChr=None):
-    #     if pChr:
-    #         return self.cooler_file.bins().fetch(pChr)
-    #     else:
-    #         if 'weight' in self.cooler_file.bins():
-    #             cut_intervals_data_frame = self.cooler_file.bins()[['chrom', 'start', 'end', 'weight']][:]
-    #         else:
-    #             cut_intervals_data_frame = self.cooler_file.bins()[['chrom', 'start', 'end']][:]
-    #     self.cut_intervals = [tuple(x) for x in cut_intervals_data_frame.values]
-
     def load_cool_matrix(self, pChr):
         return self.cooler_file.matrix(balance=False, as_pixels=True).fetch(pChr)
 
@@ -177,40 +167,30 @@ def load_cool(self, pMatrixFile, pChrnameList=None, pMatrixOnly=None, pIntraChro
         if pChrnameList is None:
             # some bug in csr funtion of cooler or numpy to double the data
             matrix_data_frame = self.cooler_file.matrix(balance=False, as_pixels=True)[:]
-            # log.info("matrix data frame LOAD: {}".format(matrix_data_frame.values))
-            # log.info("matrix data frame data LOAD: {}".format(matrix_data_frame.values[:, 2].flatten()))
-            # log.info("matrix data frame row LOAD: {}".format(matrix_data_frame.values[:, 1].flatten()))
-            # log.info("matrix data frame col LOAD: {}".format(matrix_data_frame.values[:, 0].flatten()))
-
+            log.info("matrix data frame LOAD: {}".format(matrix_data_frame.values))
             length = len(self.cooler_file.bins()[['chrom']][:].index)
 
             matrix = csr_matrix((matrix_data_frame.values[:, 2].flatten(), (matrix_data_frame.values[:, 0].flatten(), matrix_data_frame.values[:, 1].flatten())), shape=(length, length))
-            # log.info("matrix data csr: {}".format(matrix) )
         else:
             if len(pChrnameList) == 1:
-
                 try:
                     matrix = self.cooler_file.matrix(balance=False, sparse=True).fetch(pChrnameList[0]).tocsr()
-                    # matrix_data_frame = self.cooler_file.matrix(balance=False, as_pixels=True).fetch(pChrnameList[0])
-                    # length = len(self.cooler_file.bins().fetch(pChrnameList[0])[['chrom']].index)
-                    # matrix = csr_matrix((matrix_data_frame.values[:, 2].flatten(), (matrix_data_frame.values[:, 0].flatten(), matrix_data_frame.values[:, 1].flatten())), shape=(length, length))
-
                 except ValueError:
                     exit("Wrong chromosome format. Please check UCSC / ensembl notation.")
-
             else:
                 exit("Operation to load more as one region is not supported.")
 
         cut_intervals_data_frame = None
         correction_factors_data_frame = None
+
         if pChrnameList is not None:
             if len(pChrnameList) == 1:
                 cut_intervals_data_frame = self.cooler_file.bins().fetch(pChrnameList[0])
+
                 if 'weight' in cut_intervals_data_frame:
                     correction_factors_data_frame = cut_intervals_data_frame['weight']
             else:
                 exit("Operation to load more than one chr from bins is not supported.")
-
         else:
             if 'weight' in self.cooler_file.bins():
                 correction_factors_data_frame = self.cooler_file.bins()[['weight']][:]
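The whole-genome branch above builds a scipy CSR matrix directly from cooler's pixel table (bin1_id, bin2_id, count). Below is a minimal sketch of that construction, using an invented three-bin pixel table in place of what cooler.Cooler.matrix(as_pixels=True) returns:

import pandas as pd
from scipy.sparse import csr_matrix

# Invented stand-in for the pixel table returned by cooler (upper triangle only).
pixels = pd.DataFrame({'bin1_id': [0, 0, 1],
                       'bin2_id': [1, 2, 2],
                       'count':   [5, 2, 7]})
length = 3  # number of bins; the real code uses len(self.cooler_file.bins()[['chrom']][:].index)

# data = count column, row = bin1_id, col = bin2_id, as in the branch above
matrix = csr_matrix((pixels.values[:, 2], (pixels.values[:, 0], pixels.values[:, 1])),
                    shape=(length, length))
print(matrix.toarray())

The result is upper-triangular, which is why load_cool later passes it through fillLowerTriangle before returning.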
@@ -224,16 +204,12 @@ def load_cool(self, pMatrixFile, pChrnameList=None, pMatrixOnly=None, pIntraChro
             self.set_uncorrected_matrix(deepcopy(matrix))
             matrix.eliminate_zeros()
             matrix.data = matrix.data.astype(float)
-            log.info("Applying correction factors on matrix...")
 
             instances, features = matrix.nonzero()
-            log.info('len matrix.data: {}'.format(len(matrix.data)))
-            log.info('len instance: {}'.format(len(instances)))
-            log.info('len features: {}'.format(len(features)))
-            log.info('len correctionfactors.values: {}'.format(len(correction_factors_data_frame.values)))
-
-            for i in range(len(matrix.data)):
-                matrix.data[i] /= correction_factors_data_frame.values[instances[i]] * correction_factors_data_frame.values[features[i]]
+            instances_factors = correction_factors_data_frame.values[instances].flatten()
+            features_factors = correction_factors_data_frame.values[features].flatten()
+            instances_factors *= features_factors
+            matrix.data /= instances_factors
             correction_factors = correction_factors_data_frame.values
 
         cut_intervals = []
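The rewritten correction step above replaces a per-element Python loop with a single vectorized division over matrix.data. A self-contained sketch with toy counts and weights (not data from the repository) showing that the two forms agree:

import numpy as np
from scipy.sparse import csr_matrix

# Toy symmetric count matrix and per-bin weights (invented values).
counts = csr_matrix(np.array([[0., 4., 2.],
                              [4., 0., 6.],
                              [2., 6., 0.]]))
weights = np.array([0.5, 1.0, 2.0])  # stand-in for the cooler 'weight' column

rows, cols = counts.nonzero()

vectorized = counts.copy()
vectorized.data /= weights[rows] * weights[cols]   # new: one whole-array division

looped = counts.copy()
for i in range(len(looped.data)):                  # old: element-by-element division
    looped.data[i] /= weights[rows[i]] * weights[cols[i]]

assert np.allclose(vectorized.data, looped.data)

Indexing the weight vector with the row and column arrays returned by nonzero() yields one factor per stored value, so the whole division runs in numpy instead of a Python loop.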
@@ -258,14 +234,9 @@ def load_cool(self, pMatrixFile, pChrnameList=None, pMatrixOnly=None, pIntraChro
         nan_bins = None
 
         distance_counts = None
-        # log.info("matrix data csr BEFORE FILL_LOWER: {}".format(matrix) )
 
         matrix = hiCMatrix.fillLowerTriangle(matrix)
 
-        # log.info("matrix data csr AFTER FILL_LOWER: {}".format(matrix) )
-
-        # log.info('cut_intervals: {}'.format(cut_intervals))
-
         return matrix, cut_intervals, nan_bins, distance_counts, correction_factors
 
     @staticmethod
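fillLowerTriangle mirrors the upper-triangle counts onto the lower triangle before the matrix is returned. The following is a hedged illustration of that idea in scipy, not the project's actual implementation:

import numpy as np
from scipy.sparse import csr_matrix, triu

# Toy upper-triangular contact matrix (invented values).
upper = csr_matrix(np.array([[1, 5, 2],
                             [0, 3, 7],
                             [0, 0, 4]]))

# Mirror the strictly-upper part onto the lower triangle; the diagonal stays as is.
symmetric = upper + triu(upper, k=1).T
print(symmetric.toarray())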
@@ -317,7 +288,11 @@ def load_h5(matrix_filename):
                 distance_counts = f.root.correction_factors.read()
             else:
                 distance_counts = None
-            log.info("H5:::matrix.data[:10]: {}".format(matrix.data[:10]))
+            # log.info("H5:::matrix.data[:10]: {}".format(matrix.data[:10]))
+            # log.info("H5:::matrix.data[:10]: {}".format(matrix.data[:10]))
+            # log.info("H5:::matrix.data[100:110]: {}".format(matrix.data[100:110]))
+            # log.info("H5:::matrix.data[500:510]: {}".format(matrix.data[500:510]))
+            # log.info("H5:::matrix.data[-10:-1]: {}".format(matrix.data[-10:-1]))
             return matrix, cut_intervals, nan_bins, distance_counts, correction_factors
 
     @staticmethod
@@ -1368,19 +1343,18 @@ def save_cooler(self, pFileName, pDataFrameBins=None, pDataFrameMatrix=None, pSy
                 pDataFrameBins['end'] = pDataFrameBins['end'].astype(np.int64)
             bins_data_frame = pDataFrameBins
         else:
-            log.info("self.cut_intervals[:10]: {}".format(self.cut_intervals[:10]))
-            # log.info("np.array(self.cut_intervals)[:, :3][:10]: {}", np.array(self.cut_intervals)[:, :3][:10])
             cut_intervals_ = []
+            # extra_list = []
             for value in self.cut_intervals:
                 cut_intervals_.append(tuple((value[0], value[1], value[2])))
+                # extra_list.append(value[3])
             bins_data_frame = pd.DataFrame(cut_intervals_, columns=['chrom', 'start', 'end'])
-            # log.info("bins_data_frame: {}".format(bins_data_frame))
         # append correction factors if they exist
         if self.correction_factors is not None:
             log.debug("Correction factors present! self.correction_factors is not None")
 
             bins_data_frame = bins_data_frame.assign(weight=self.correction_factors)
-            # log.info("bins_data_frame II : {}".format(bins_data_frame))
+            # bins_data_frame = bins_data_frame.assign(extra=extra_list)
 
         if pDataFrameMatrix:
             if pDataFrameMatrix['bin1_id'].dtypes != 'int64':
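In save_cooler, the bins table is assembled from self.cut_intervals and, when correction factors are present, extended with a 'weight' column. A sketch with invented intervals and weights, mirroring the branch above:

import pandas as pd

# Invented (chrom, start, end, extra) tuples and per-bin weights.
cut_intervals = [('chr1', 0, 10000, 1.0), ('chr1', 10000, 20000, 0.8)]
correction_factors = [0.9, 1.1]

bins_data_frame = pd.DataFrame([(chrom, start, end) for chrom, start, end, _ in cut_intervals],
                               columns=['chrom', 'start', 'end'])
bins_data_frame = bins_data_frame.assign(weight=correction_factors)
print(bins_data_frame)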
@@ -1400,10 +1374,18 @@ def save_cooler(self, pFileName, pDataFrameBins=None, pDataFrameMatrix=None, pSy
             log.info("Reverting correction factors on matrix...")
 
             instances, features = matrix.nonzero()
-            for i in range(len(matrix.data)):
-                matrix.data[i] *= self.correction_factors[instances[i]] * self.correction_factors[features[i]]
 
+            instances_factors = self.correction_factors[instances].flatten()
+            features_factors = self.correction_factors[features].flatten()
+
+            instances_factors *= features_factors
+            matrix.data *= instances_factors
+            instances_factors = None
+            features_factors = None
+
+            matrix.data = np.rint(matrix.data)
             matrix.data = matrix.data.astype(int)
+
             data = matrix.data.tolist()
 
         elif self.uncorrected_matrix is not None:
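Before writing, save_cooler reverts the balancing by multiplying each stored value with both bin weights, then rounds with np.rint so the cast back to int does not truncate values such as 3.9999999 to 3. A toy sketch of that revert step (counts and weights are made up):

import numpy as np
from scipy.sparse import csr_matrix

corrected = csr_matrix(np.array([[0.0, 8.0],
                                 [8.0, 0.0]]))
correction_factors = np.array([0.5, 1.0])  # hypothetical per-bin weights

instances, features = corrected.nonzero()
corrected.data *= correction_factors[instances] * correction_factors[features]
corrected.data = np.rint(corrected.data)   # guard against float round-off
corrected.data = corrected.data.astype(int)
print(corrected.toarray())                 # back to integer counts: [[0 4], [4 0]]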
@@ -1417,11 +1399,9 @@ def save_cooler(self, pFileName, pDataFrameBins=None, pDataFrameMatrix=None, pSy
 
         cooler._writer.COUNT_DTYPE = matrix.dtype
 
-        log.info("Data in save uncorrected II: {}".format(data[:10]))
-
         matrix_tuple_list = zip(instances.tolist(), features.tolist(), data)
         matrix_data_frame = pd.DataFrame(matrix_tuple_list, columns=['bin1_id', 'bin2_id', 'count'])
-        log.info("matrix data frame SAVE: {}".format(matrix_data_frame.values[:10]))
+
         cooler.io.create(cool_uri=pFileName,
                          bins=bins_data_frame,
                          pixels=matrix_data_frame)

hicexplorer/test/test_hicAggregateContacts.py

Lines changed: 52 additions & 0 deletions
@@ -42,6 +42,7 @@ def test_hicAggregateContacts():
     os.remove(outfile_aggregate_plots.name)
 
 
+@pytest.mark.xfail
 @pytest.mark.skipif(MID_MEMORY > memory,
                     reason="Travis has too less memory to run it.")
 def test_hicAggregateContacts_cooler():
@@ -90,6 +91,35 @@ def test_hicAggregateContacts_clustering():
     os.remove(outfile_heatmaps.name)
 
 
+@pytest.mark.xfail
+@pytest.mark.skipif(MID_MEMORY > memory,
+                    reason="Travis has too less memory to run it.")
+def test_hicAggregateContacts_clustering_cool():
+
+    outfile_aggregate_plots = NamedTemporaryFile(suffix='.png', prefix='hicaggregate_test_', delete=False)
+    outfile_heatmaps = NamedTemporaryFile(suffix='.png', prefix='hicaggregate_heatmap_', delete=False)
+
+    args = "--matrix {root}/Li_et_al_2015.cool --BED {root}/hicAggregateContacts/test_regions.bed " \
+           "--outFileName {out_agg} --numberOfBins 30 --range 50000:900000 --hclust 4 " \
+           "--diagnosticHeatmapFile {out_heat} --howToCluster diagonal --disable_bbox_tight " \
+           "--BED2 {root}/hicAggregateContacts/test_regions.bed".format(root=ROOT, out_agg=outfile_aggregate_plots.name,
+                                                                        out_heat=outfile_heatmaps.name)
+
+    test_image_agg = ROOT + 'hicAggregateContacts/master_aggregate_hclust4.png'
+    test_image_heatmap = ROOT + 'hicAggregateContacts/master_heatmap.png'
+
+    hicexplorer.hicAggregateContacts.main(args.split())
+
+    res = compare_images(test_image_agg, outfile_aggregate_plots.name, tolerance)
+    assert res is None, res
+
+    res = compare_images(test_image_heatmap, outfile_heatmaps.name, tolerance)
+    assert res is None, res
+
+    os.remove(outfile_aggregate_plots.name)
+    os.remove(outfile_heatmaps.name)
+
+
 @pytest.mark.skipif(MID_MEMORY > memory,
                     reason="Travis has too less memory to run it.")
 def test_hicAggregateContacts_3d():
@@ -109,3 +139,25 @@ def test_hicAggregateContacts_3d():
     assert res is None, res
 
     os.remove(outfile_aggregate_3d.name)
+
+
+@pytest.mark.xfail
+@pytest.mark.skipif(MID_MEMORY > memory,
+                    reason="Travis has too less memory to run it.")
+def test_hicAggregateContacts_3d_cooler():
+
+    outfile_aggregate_3d = NamedTemporaryFile(suffix='.png', prefix='hicaggregate_test_3d', delete=False)
+
+    args = "--matrix {root}/Li_et_al_2015.cool --BED {root}/hicAggregateContacts/test_regions.bed " \
+           "--outFileName {out_agg} --numberOfBins 30 --range 50000:900000 --hclust 2 " \
+           "--plotType 3d --disable_bbox_tight " \
+           "--BED2 {root}/hicAggregateContacts/test_regions.bed".format(root=ROOT, out_agg=outfile_aggregate_3d.name)
+
+    test_image_agg_3d = ROOT + 'hicAggregateContacts/master_aggregate_3d.png'
+
+    hicexplorer.hicAggregateContacts.main(args.split())
+
+    res = compare_images(test_image_agg_3d, outfile_aggregate_3d.name, tolerance)
+    assert res is None, res
+
+    os.remove(outfile_aggregate_3d.name)
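The new cooler-based tests stack @pytest.mark.xfail on top of the existing memory-based skipif, so an image mismatch is reported as an expected failure rather than breaking CI. A minimal sketch of how the two markers combine; MID_MEMORY and memory are placeholder values here, not the ones computed in the test module:

import pytest

MID_MEMORY = 6  # hypothetical GiB needed by the test
memory = 8      # hypothetical GiB available on the CI runner


@pytest.mark.xfail
@pytest.mark.skipif(MID_MEMORY > memory,
                    reason="Travis has too less memory to run it.")
def test_cooler_aggregate_sketch():
    # Stand-in body: when enough memory is available the test runs, and a
    # failing assert is recorded as XFAIL instead of a test failure.
    assert False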
