Open andrewshvv opened 3 years ago
Created a very first prototype:
class GCContentVariance(EnforceGCContent):
def __init__(self, allowed_gc_difference, window, location):
self.allowed_gc_difference = allowed_gc_difference
super(GCContentVariance, self).__init__(window=window,
location=location)
def evaluate(self, problem):
"""Return the sum of breaches extent for all windowed breaches."""
wstart, wend = self.location.start, self.location.end
sequence = self.location.extract_sequence(problem.sequence)
gc = gc_content(sequence, window_size=self.window)
# Create enumerated gc matrix of (index, gc content) to preserver the
# index after sorting
gc = np.array(list(enumerate(gc)))
# Sort by gc content
gc = gc[np.argsort(gc[:, 1])]
max_index, max_gc = gc[len(gc) - 1]
min_index, min_gc = gc[0]
max_difference = max_gc - min_gc
breaches = []
score = 0
if max_difference > self.allowed_gc_difference:
min_allowed_gc = max_gc - self.allowed_gc_difference
# Places where gc content is low enough to be not allowed
gc_content_is_too_low = gc[:, 1] < min_allowed_gc
breaches = gc[gc_content_is_too_low][:, 0]
# Add max index as breach as well
breaches = np.append(breaches, max_index)
# Use all breaches as score, but let only the maximum breach to
# be out.
score = -len(breaches)
breaches = np.array([max_index, min_index])
breaches_starts = wstart + breaches
breaches_locations = []
message = "Passed!"
if len(breaches_starts) != 0:
desc = "GC content difference over %s." % \
int(self.allowed_gc_difference * 100)
for index in breaches_starts:
breaches_locations.append([
ProblemLocation(start=index,
end=index + self.window,
description=desc)
])
return SpecEvaluation(
self, problem, score, locations=breaches_locations, message=message
)
Thanks very much, this is an interesting synthesis constraint that may need to be added. I'll try / look into your suggestion sometime during this week.
I had a look at the example; among other issues, in your code you call a ProblemLocation
with its description=desc
which is not part of DNA Chisel. Do you have a working version by any chance? I'm involved with other projects so I wouldn't be able to build this class from scratch now, but happy to review and merge a working example.
My two cents on this is that this would be a great constraint to support since Twist is enforcing it, but it might also be a tricky to resolve, in particular for sequences with many separate extreme-GC regions. An alternative approach is to use a simple EnforceGCContent
with bounds that are 50% apart. This is not completely equivalent, but still a solid solution:
def limit_gc_variance(sequence, radius=0.5, window=50, location):
average_gc = gc_content(sequence)
return EnforceGCContent(
mini=average_gc - radius/2,
maxi = average_gc + radius / 2,
window=window,
location=location
)
@Zulko Unfortunately, it is not gonna work for us, because our constraint check should be 100% similar to Twist so that we don't end up with a false-positive situation. We delegate resolving the issue with separate extreme-GC regions to our customers, similar to the Twist, so if that is the case they would just not order the DNA.
@veghp Yeah, I have working code, I will get back to it when I finish working on my current task.
Last working prototype. One note, currently it returns only two breach locations with max and min GC, similar to Twist.
class GCContentVariance(BreachMixin, dc.Specification):
"""Specification on the variance of G/C nucleotides.
Parameters
----------
allowed_gc_difference
Maximum allowed difference in GC content between two
sub-sequences, length of which is defined by window size.
window
Length of the sliding window, in nucleotides, for local GC content.
If not provided, the global GC content of the whole sequence is
considered
location
Location objet indicating that the Specification only applies to a
subsegment of the sequence. Make sure it is bigger than ``window``
if both parameters are provided
"""
def __init__(self, allowed_gc_difference, window, location=None, **kwargs):
self.allowed_gc_difference = allowed_gc_difference
self.window = window
self.location = location
super().__init__(**kwargs)
def initialized_on_problem(self, problem, role=None):
return self._copy_with_full_span_if_no_location(problem)
def evaluate(self, problem):
"""Return the sum of breaches extent for all windowed breaches."""
wstart, wend = self.location.start, self.location.end
sequence = self.location.extract_sequence(problem.sequence)
gc = gc_content(sequence, window_size=self.window)
breaches_starts = []
score = 0
if len(gc) != 0:
# Create enumerated gc matrix of (index, gc content) to preserver
# the index after sorting
gc = np.array(list(enumerate(gc)))
# Sort by gc content
gc = gc[np.argsort(gc[:, 1])]
max_index, max_gc = gc[len(gc) - 1]
min_index, min_gc = gc[0]
max_difference = max_gc - min_gc
if max_difference > self.allowed_gc_difference:
min_allowed_gc = max_gc - self.allowed_gc_difference
# Places where gc content is low enough to be not allowed
gc_content_is_too_low = gc[:, 1] < min_allowed_gc
breaches = gc[gc_content_is_too_low][:, 0]
# Add max index as breach as well
breaches = np.append(breaches, max_index)
# Use all breaches as score, but let only the maximum breach to
# be out.
score = -len(breaches)
breaches = np.array([max_index, min_index])
breaches_starts = wstart + breaches
breaches_locations = []
message = "Passed!"
if len(breaches_starts) != 0:
message = "Didn't pass!"
for index in breaches_starts:
breaches_locations.append(
Location(start=index,
end=index + self.window)
)
return dc.SpecEvaluation(
self, problem, score, locations=breaches_locations, message=message
)
Last working prototype.
Thanks for sharing the code!
our constraint check should be 100% similar to Twist so that we don't end up with a false-positive situation
Nitpicking (and making sure I understand this right) but the constraint I suggested above is a stronger constraint so you wouldn't have false positives from the simulated supplier, only false negative.
We delegate resolving the issue with separate extreme-GC regions to our customers
Do I understand correctly that you actually don't need Chisel to solve the GC problems, you just need a constraint for DNA Weaver suppliers? In DNA Weaver a constraint could be just a function, or a class on the model of these classes. Again, apologies for the lack of documentation, any contribution appreciated :D
returns only two breach locations with max and min GC, similar to Twist.
Not a big fan of their approach as it forces to iterate (submit the sequence, get two issues, resubmit, get other issues, etc.). In DNA Chisel, all problematic regions are returned at once, the contract is "these are all the regions where mutations could make the constraint pass 100% for the sequence".
On the contrary, I use DNA chisel mostly for validation and getting the proper messages, the dna weawer constraint is just a wrapper. I just wanted everything to be consistent with the library. Example:
# validate_synthesis checks weather the sequences passes the
# synthesis constraints, and return problem specifications.
def validate_synthesis(
seq: SeqRecord,
synthesis_constraints: list[Specification],
):
problems = []
evaluations = DnaOptimizationProblem(
sequence=seq,
constraints=synthesis_constraints,
).constraints_evaluations()
problems += [e.specification for e in evaluations if not e.passes]
return problems
def synthesis_constraints(
avoid_enzymes_names: list[str] = None,
gg_enzyme_name: str = None,
config: SynthesisConstraintsConfig = twist_config
) -> list[Specification]:
constraints = [
LengthConstraint(
max_length=config.SYNTHESIS_MAX_LENGTH,
min_length=config.SYNTHESIS_MIN_LENGTH,
breach_desc="Sequence length more than %s or "
"less than %s" % (config.SYNTHESIS_MAX_LENGTH,
config.SYNTHESIS_MIN_LENGTH)
),
# Twist doesn't allow repeats of more than 20 length
AvoidPattern(
pattern=RepeatedKmerPattern(2, config.REPEATED_KMER_LEN),
breach_desc="Long direct repeat (>=20bp) detected. "
"Please break up repeats. Fewer/lower homology "
"repeats will be optimal for success."
),
# Twist avoid global GC content being more
# than 65% and less than 25%.
EnforceGCContent(
maxi=config.GLOBAL_MAX_GC_CONTENT,
mini=config.GLOBAL_MIN_GC_CONTENT,
breach_desc="Sequence has more than 70% or less than 25% "
"global GC content.",
),
GCContentVariance(
allowed_gc_difference=config.ALLOWED_GC_CONTENT_DIFFERENCE,
window=config.GC_CONTENT_WINDOW,
breach_desc="We located two 50 bp windows with a GC content "
"difference over 52%. "
),
AvoidPattern(
pattern="%dxA" % config.MAX_HOMOPOLYMER_LENGTH,
breach_desc="Homopolymer patter of 10 or more A's have been "
"found. Long homopolymer stretches increase "
"complexity. Please break up long homopolymers.",
),
AvoidPattern(
pattern="%dxT" % config.MAX_HOMOPOLYMER_LENGTH,
breach_desc="Homopolymer patter of 10 or more T's have been "
"found. Long homopolymer stretches increase "
"complexity. Please break up long homopolymers."
),
AvoidPattern(
pattern="%dxG" % config.MAX_HOMOPOLYMER_LENGTH,
breach_desc="Homopolymer patter of 10 or more G's have been "
"found. Long homopolymer stretches increase "
"complexity. Please break up long homopolymers.",
),
AvoidPattern(
pattern="%dxC" % config.MAX_HOMOPOLYMER_LENGTH,
breach_desc="Homopolymer patter of 10 or more C's have "
"been found. Long homopolymer stretches increase "
"complexity. Please break up long homopolymers."
),
]
if gg_enzyme_name:
# Golden gate enzyme is encountered at flanks two times.
allowed_times = 2
constraints.append(
RepeatedPattern(
allowed_times=allowed_times,
pattern=EnzymeSitePattern(gg_enzyme_name),
breach_desc="Enzyme %s site have been found "
"more than %s times." % (gg_enzyme_name,
allowed_times)
))
avoid_enzymes_names = avoid_enzymes_names if avoid_enzymes_names else []
for enzyme in avoid_enzymes_names:
constraints.append(
AvoidPattern(pattern=EnzymeSitePattern(enzyme),
breach_desc="Enzyme %s site have "
"been found." % enzyme)
)
return constraints
# TwistValidator is used for dnaweawer.
class TwistValidator(object):
def __init__(self, config: SynthesisConstraintsConfig = twist_config,
avoid_enzymes: list[str] = None,
gg_enzyme: str = None):
self.config = config
self.avoid_enzymes_names = avoid_enzymes
self.gg_enzyme_name = gg_enzyme
def __call__(self, sequence):
constraints = synthesis_constraints(
avoid_enzymes_names=self.avoid_enzymes_names,
gg_enzyme_name=self.gg_enzyme_name,
config=self.config,
)
problems = validate_synthesis(seq=sequence,
synthesis_constraints=constraints)
return len(problems) == 0
Twist Bioscience has a GC variance constraint. Quote from their website:
If a sequence with such abnormality is loaded in their interface the following error occurs:
Using DNA Chisel + DNA Weaver without such constraint might lead to the construction of non-valid sequences.