# Here is the code provided in the article, with the additional feature of converting the random sequences into FASTA format.
# To begin, let us use the code developed by Schuster & Hirth (DOI: 10.1126/sciadv.ade8259)
import random
# Nucleotide alphabet and the relative sampling weight of each base, listed in
# the same order.  random.choices() treats weights as relative, so these values
# (which sum to 1.2, not 1) give effective probabilities of G = C = 1/6 and
# A = T = 1/3.
BASES = ["G", "C", "A", "T"]
BASE_WEIGHTS = (0.2, 0.2, 0.4, 0.4)

# Length (in bases) of each group of random sequences, in the order the groups
# are written, and the number of sequences generated per group.
SEQUENCE_LENGTHS = [365, 247, 373]
SEQUENCES_PER_LENGTH = 50

# Write one random sequence per line: 50 of length 365, then 50 of length 247,
# then 50 of length 373 (150 lines in total).  Opening with 'w' overwrites any
# existing sequences.txt.
with open('sequences.txt', 'w') as f:
    for length in SEQUENCE_LENGTHS:
        for _ in range(SEQUENCES_PER_LENGTH):
            f.write("".join(random.choices(BASES, weights=BASE_WEIGHTS, k=length)))
            f.write("\n")
# Open the input file and output files
# Route each line of sequences.txt to the output file for its group.  A line
# is written only when BOTH its 0-based position falls inside the group's index
# range AND its stripped length equals the group's expected length; a line that
# matches no group is skipped without being written anywhere.
with open('sequences.txt', 'r') as input_file, \
        open('sv_PAX2_random.txt', 'w') as sv_pax2, \
        open('inv_EN2_random.txt', 'w') as inv_en2, \
        open('dac_DACH1_random.txt', 'w') as dac_dach1:
    # (0-based line-index range, expected sequence length, destination file)
    groups = [
        (range(0, 50), 365, sv_pax2),        # lines 1-50
        (range(50, 100), 247, dac_dach1),    # lines 51-100
        (range(100, 150), 373, inv_en2),     # lines 101-150
    ]
    for i, line in enumerate(input_file):
        # Measure the sequence without its trailing newline, once per line.
        seq_len = len(line.strip())
        for index_range, expected_len, out_file in groups:
            if i in index_range and seq_len == expected_len:
                # Write the raw line so its original newline is kept.
                out_file.write(line)
                break
# Now, to make these sequences identifiable, we are going to turn them into FASTA format. This will be done for each new .txt file.
# Open the input text file containing the random sequences
def _write_fasta(input_path, output_path, header_prefix):
    """Convert a one-sequence-per-line text file into a FASTA-formatted file.

    Each non-empty input line becomes a two-line record: a header line
    ``>`` + *header_prefix* + record number, followed by the sequence.

    Args:
        input_path: Text file to read; each non-empty line is one sequence.
        output_path: File to create (or overwrite) with the FASTA records.
        header_prefix: Text placed after ``>`` in every header, immediately
            before the record number.
    """
    with open(input_path, 'r') as input_file, \
            open(output_path, 'w') as output_file:
        # Records are numbered by their 1-based line position in the input,
        # so a skipped blank line leaves a gap instead of renumbering the
        # records that follow it.
        for line_number, sequence in enumerate(input_file, start=1):
            sequence = sequence.strip()  # drop the newline / stray whitespace
            if not sequence:
                continue  # ignore empty lines
            output_file.write(f">{header_prefix}{line_number}\n")
            output_file.write(sequence + "\n")


# Apply the same conversion to each per-group file:
#   <name>_random.txt -> <name>_random_FASTA.txt,
# with headers of the form ">random_<name>_Sequence_<n>".
for _name in ('dac_DACH1', 'inv_EN2', 'sv_PAX2'):
    _write_fasta(
        f'{_name}_random.txt',
        f'{_name}_random_FASTA.txt',
        f'random_{_name}_Sequence_',
    )