NakhlehLab / MaCroDNA

A high-accuracy tool for integrating scDNA-seq and scRNA-seq data
MIT License
1 stars 1 forks source link

installtion error! #2

Closed naila53 closed 3 weeks ago

naila53 commented 4 weeks ago

Hi, thanks for developing this tool! I kept getting this error when trying to install the tool python macrodna.py ile "/home/naila22/.conda/envs/macroDNA/lib/python3.12/site-packages/pandas/core/indexing.py", line 1178, in __getitem__ check_dict_or_set_indexers(key) File "/home/naila22/.conda/envs/macroDNA/lib/python3.12/site-packages/pandas/core/indexing.py", line 2774, in check_dict_or_set_indexers raise TypeError( TypeError: Passing a set as an indexer is not supported. Use a list instead.

this solved it for me: I changed this line: genes = set(self.dna_df.index).intersection(self.rna_df.index.to_list()) to this : genes = list(set(self.dna_df.index).intersection(self.rna_df.index.to_list()))

the installation works fine and prints out this output, however, I can't import it from python!

MaCroDNA will be run for 1 steps
the smallest set has 4 number of cells
Restricted license - for non-production use only - expires 2025-11-24
Gurobi Optimizer version 11.0.3 build v11.0.3rc0 (linux64 - "Ubuntu 22.04.2 LTS")

CPU model: Intel(R) Xeon(R) Gold 6130 CPU @ 2.10GHz, instruction set [SSE2|AVX|AVX2|AVX512]
Thread count: 64 physical cores, 128 logical processors, using up to 32 threads

Optimize a model with 9 rows, 16 columns and 48 nonzeros
Model fingerprint: 0xd5e76473
Variable types: 0 continuous, 16 integer (16 binary)
Coefficient statistics:
  Matrix range     [1e+00, 1e+00]
  Objective range  [2e-01, 1e+00]
  Bounds range     [1e+00, 1e+00]
  RHS range        [1e+00, 4e+00]
Found heuristic solution: objective 2.8300077
Presolve removed 9 rows and 16 columns
Presolve time: 0.00s
Presolve: All rows and columns removed

Explored 0 nodes (0 simplex iterations) in 0.00 seconds (0.00 work units)
Thread count was 1 (of 128 available processors)

Solution count 1: 2.83001 

Optimal solution found (tolerance 1.00e-04)
Best objective 2.830007718086e+00, best bound 2.830007718086e+00, gap 0.0000%
IsMIP: 1
Solved with MIPFocus: 0
The model has been optimized
Obj: 2.83001
the number of associations in the correspondence matrix 4.0
**********
Finish Mapping
Test Success
**********
python
Python 3.12.7 | packaged by conda-forge | (main, Oct  4 2024, 16:05:46) [GCC 13.3.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> from MaCroDNA import MaCroDNA
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
ModuleNotFoundError: No module named 'MaCroDNA'
naila53 commented 3 weeks ago

issue is solved in this pull request

I Also editied the macrodna.py to explicity call my WLS Gurobi paramets as i'm running it on a remote server, these info are stored in your license.lic file:

class MaCroDNA:
    def __init__(self, rna_df=None, dna_df=None, dna_label=None):
        # INPUT
        # rna_df, dna_df -- dataframe, cols are cell ids, index are genes
        # dna_label -- dataframe, two cols, one is the clone id "clone", the other is the cell id "cell"
        self.rna_df = rna_df
        self.dna_df = dna_df
        self.dna_label = dna_label

        # Set the Gurobi WLS license options
        self.gurobi_options = {
            "WLSACCESSID": "fml5p5trdmpfv;dmrp53550gvmfpeg",  # Replace with your actual WLSACCESSID
            "WLSSECRET": "fgrogrgmfpgrep5560tm",    # Replace with your actual WLSSECRET
            "LICENSEID": 12948568                             # Replace with your actual LICENSEID it'sa number
        }

    def cosine_similarity(self, x1, x2):
        return 1 - spatial.distance.cosine(x1 - x1.mean(), x2 - x2.mean())

    def cosine_similarity_np(self, x1, x2):
        return np.dot(x1 - x1.mean(), x2 - x2.mean()) / (1e-10 + norm(x1 - x1.mean()) * norm(x2 - x2.mean()))

    def ilp(self, rna_idx, dna_idx, corrs):
        n_min = min(rna_idx.shape[0], dna_idx.shape[0])
        print("the smallest set has %s number of cells" % (n_min))

        # Create a Gurobi environment using WLS credentials
        with gp.Env(params=self.gurobi_options) as env:
            # Create the Gurobi model within the environment
            with gp.Model(env=env) as m:
                # Create a 2-D array of binary variables
                # x[i,j]=1 means that cell i in RNA corresponds to cell j in DNA
                x = []
                for i in range(rna_idx.shape[0]):
                    x.append([])
                    for j in range(dna_idx.shape[0]):
                        x[i].append(m.addVar(vtype=GRB.BINARY, name="x[%d,%d]" % (i, j)))

                # Set the constraints for rows and columns
                for i in range(rna_idx.shape[0]):
                    # At most one cell per row
                    m.addConstr(quicksum([x[i][j] for j in range(dna_idx.shape[0])]) <= 1, name="row" + str(i))

                for j in range(dna_idx.shape[0]):
                    # At most one cell per column
                    m.addConstr(quicksum([x[i][j] for i in range(rna_idx.shape[0])]) <= 1, name="col" + str(i))

                m.addConstr(quicksum([x[i][j] for i in range(rna_idx.shape[0]) for j in range(dna_idx.shape[0])]) == n_min,
                            name="global")

                # Update the model
                m.update()

                # Create the linear expression
                obj = LinExpr()
                for i in range(rna_idx.shape[0]):
                    for j in range(dna_idx.shape[0]):
                        obj += x[i][j] * corrs[rna_idx[i]][dna_idx[j]]

                # Set the objective
                m.setObjective(obj, GRB.MAXIMIZE)

                # Optimize the model
                m.optimize()

                print('IsMIP: %d' % m.IsMIP)
                if m.status == GRB.Status.INFEASIBLE:
                    print("The model is infeasible")
                print("Solved with MIPFocus: %d" % m.Params.MIPFocus)
                print("The model has been optimized")
                print('Obj: %g' % m.objVal)

                results = np.empty((rna_idx.shape[0], dna_idx.shape[0]))
                for i in range(rna_idx.shape[0]):
                    for j in range(dna_idx.shape[0]):
                        results[i][j] = int(x[i][j].x)

        return results

    def cell2cell_assignment(self):
        dna_cells = list(self.dna_df.columns)
        rna_cells = list(self.rna_df.columns)
        genes = list(set(self.dna_df.index).intersection(self.rna_df.index.to_list()))
        self.dna_df = self.dna_df.loc[genes, :]
        self.rna_df = self.rna_df.loc[genes, :]

        dna_np = self.dna_df.T.to_numpy()
        rna_np = self.rna_df.T.to_numpy()
        print("number of cells in dna data %s" % (dna_np.shape[0]))
        print("number of cells in rna data %s" % (rna_np.shape[0]))
        print("number of genes in dna data %s" % (dna_np.shape[1]))
        print("number of genes in rna data %s" % (rna_np.shape[1]))

        # Create the array containing Pearson correlation coefficients between all possible pairs
        # The rows of the matrix correspond the rna cells and the columns correspond the dna cells
        corrs = np.empty((rna_np.shape[0], dna_np.shape[0]))

        for i in range(rna_np.shape[0]):
            for j in range(dna_np.shape[0]):
                corrs[i][j] = self.cosine_similarity_np(rna_np[i], dna_np[j])

        # Create a matrix for storing the global correspondences
        global_correspondence = np.zeros((rna_np.shape[0], dna_np.shape[0]))
        # Create a matrix of correspondence where the entries represent the steps
        tagged_correspondence = np.zeros((rna_np.shape[0], dna_np.shape[0]))
        # Create index lists for dna and rna
        global_dna_idx = np.array([i for i in range(dna_np.shape[0])])
        global_rna_idx = np.array([i for i in range(rna_np.shape[0])])

        # Calculate the number of steps
        quotient, remainder = divmod(global_rna_idx.shape[0], global_dna_idx.shape[0])
        print(quotient, remainder)
        n_iters = int(quotient)
        if remainder != 0:
            n_iters += 1
        print("MaCroDNA will be run for %s steps" % (n_iters))

        rna_idx = np.copy(global_rna_idx)

        # Iterations
        for step_ in range(n_iters):
            r = self.ilp(rna_idx, global_dna_idx, corrs)
            # Identify the indices with assignments
            r = np.array(r)
            for i in range(r.shape[0]):
                for j in range(r.shape[1]):
                    if r[i][j] == 1:
                        global_correspondence[rna_idx[i]][global_dna_idx[j]] = 1  # Binary correspondence matrix
                        tagged_correspondence[rna_idx[i]][global_dna_idx[j]] = step_ + 1  # Tagged correspondence matrix

            sums = np.sum(global_correspondence, axis=1)
            idx_remove = np.argwhere(sums == 1)
            idx_remove = np.squeeze(idx_remove)
            rna_idx = np.delete(global_rna_idx, idx_remove)  # Remove cells whose correspondence was found

        print("the number of associations in the correspondence matrix %s" % np.sum(global_correspondence))

        result_df = pd.DataFrame(data=global_correspondence, columns=dna_cells, index=rna_cells)
        tagged_df = pd.DataFrame(data=tagged_correspondence, columns=dna_cells, index=rna_cells)

        # Find the dna that map to rna
        result_rna = []
        result_dna = []
        rna_cells = list(result_df.index)
        dna_cells = list(result_df.columns)
        for d in rna_cells:
            tmp_rna = list(result_df.loc[d, :])
            tmp_rna_dna_index = tmp_rna.index(1)
            tmp_rna_dna = dna_cells[tmp_rna_dna_index]
            result_dna.append(tmp_rna_dna)
            result_rna.append(d)
        tmp_result = pd.DataFrame(list(zip(result_dna, result_rna)), columns=["predict_cell", "cell"])
        tmp_result = tmp_result.set_index("cell")

        # Change the format of the tagged correspondence matrix similar to the binary correspondence matrix
        result_rna_tagged = []
        result_dna_tagged = []
        result_tags = []
        for d in list(tagged_df.index):
            tmp_rna = list(tagged_df.loc[d, :])
            for step__ in range(n_iters):
                if (step__ + 1) in tmp_rna:
                    tmp_rna_dna_index = tmp_rna.index(step__ + 1)
                    tmp_rna_dna = dna_cells[tmp_rna_dna_index]
                    result_dna_tagged.append(tmp_rna_dna)
                    result_rna_tagged.append(d)
                    result_tags.append(step__ + 1)
        tmp_result_tagged = pd.DataFrame(list(zip(result_dna_tagged, result_rna_tagged, result_tags)),
                                         columns=["predict_cell", "cell", "step"])
        tmp_result_tagged = tmp_result_tagged.set_index("cell")

        return tmp_result, tmp_result_tagged

    def cell2clone_assignment(self):
        rna_result, _ = self.cell2cell_assignment()
        dna_cells = list(self.dna_df.columns)

        # Map to clone
        dna_label = self.dna_label
        dna_label = dna_label.set_index("cell")

        dna_result_label = dna_label.loc[rna_result["predict_cell"]]["clone"].tolist()
        rna_result["predict"] = dna_result_label
        return rna_result

    def tiny_test(self):
        dna_data = pd.DataFrame.from_dict({"cell1": [2, 2, 3, 1, 6, 2],
                                           "cell2": [2, 2, 2, 2, 2, 2],
                                           "cell3": [1, 1, 2, 2, 2, 3],
                                           "cell4": [2, 2, 2, 2, 2, 6],
                                           "gene": ["g1", "g2", "g3", "g4", "g5", "g6"]})
        dna_data = dna_data.set_index("gene")

        rna_data = pd.DataFrame.from_dict({"cell1": [0, 0, 10, 0, 20, 0, 0],
                                           "cell2": [2, 2, 2, 2, 2, 2, 0],
                                           "cell3": [0, 0, 2, 2, 0, 5, 0],
                                           "cell4": [1, 1, 1, 1, 1, 20, 0],
                                           "gene": ["g1", "g2", "g3", "g4", "g5", "g6", "g7"]})
        rna_data = rna_data.set_index("gene")

        dna_cluster = pd.DataFrame.from_dict({"clone": [0, 1, 2, 3],
                                              "cell": ["cell1", "cell2", "cell3", "cell4"]})

        print("******Test DNA data is:")
        print(dna_data)
        print("******Test RNA data is:")
        print(rna_data)
        print("******Clone id for each DNA cell is:")
        print(dna_cluster)
        print("**********")
        print("Start Mapping RNA cells to DNA clones")
        print("**********")

        self.dna_df = dna_data
        self.rna_df = rna_data
        self.dna_label = dna_cluster
        self.cell2clone_assignment()

        print("**********")
        print("Finish Mapping")
        print("Test Success")
        print("**********")

if __name__ == '__main__':
    test = MaCroDNA()
    test.tiny_test()