Extracting lines based on a comma-separated string in another file and writing the extracted lines to a file
Solution 1:
Assuming each sequence occupies a single line (i.e. it does not wrap over multiple lines), how about an awk solution:
awk -F'\t' '
# Pass 1 (SAMPLE.fasta): index every record by the last "|"-separated
# field of its header line.  Assumes each sequence occupies exactly one
# line, so headers sit on odd lines and sequences on even lines.
NR == FNR {
    if (FNR % 2) {                      # odd line: header with contigID
        n = split($0, part, "|")
        id = part[n]                    # bare contig ID (last field)
        seq[id] = $0                    # start the record with the header
    } else {                            # even line: sequence
        seq[id] = seq[id] RS $0         # header, newline, sequence
    }
    next
}

# Pass 2 (contigIDs): column 1 is the genome ID and column 2 is a
# comma-separated list of contig IDs.  Matching records are written to
# "<genomeID>.fasta".
{
    fname = $1 ".fasta"
    # A genome ID seen on an earlier line must be appended to; reopening
    # the file with ">" after close() would truncate what was written.
    appending = (fname in written)
    wrote = 0
    count = split($2, ids, ",")
    for (i = 1; i <= count; i++) {
        # Use the last field, exactly as pass 1 does, instead of a fixed
        # index, so IDs with a different number of "|" fields still match.
        n = split(ids[i], part, "|")
        if (n == 0)
            continue
        key = part[n]
        if (!(key in seq))
            continue
        if (appending)
            print seq[key] >> fname
        else
            print seq[key] > fname
        wrote = 1
    }
    close(fname)                        # avoid running out of descriptors
    if (wrote)
        written[fname] = 1
}
' SAMPLE.fasta contigIDs
Output:
424182.1.fasta file:
>H|S1|C933685
GAAAGTTCTTGACCTGTGGACAGGCTGTGAATCGGGTTGGACAAGT
1217675.1.fasta file:
>H|S1|C85072
GGAAACGGCTGCTGCCATCCTTGCCCTTCGCCCAAG
>H|S1|C965427
CTCAAGAAATTCGGTATCACCGGTAACTATGAGGCAGTCGAGGTCG
Solution 2:
Try:
import pandas as pd


def load_sequences(fasta_path):
    """Read a FASTA file into a Series mapping header text to sequence.

    The key is the header line without the leading ``>`` and surrounding
    whitespace. Sequences wrapped over several lines are concatenated, and
    blank lines are skipped so a trailing empty line cannot replace a
    sequence with an empty string. Sequence lines that appear before the
    first header are ignored, and headers with no sequence are omitted.
    """
    records = {}
    pending_id = None
    chunks = None
    with open(fasta_path) as fp:
        for raw in fp:
            line = raw.strip()
            if not line:
                continue
            if line.startswith('>'):
                pending_id = line[1:].strip()
                chunks = []
            elif chunks is not None:
                if not chunks:
                    # First sequence line of this record: register it now so
                    # a header without any sequence never creates an entry.
                    records[pending_id] = chunks
                chunks.append(line)
    joined = {key: ''.join(parts) for key, parts in records.items()}
    # dtype=object keeps an empty mapping from becoming a float Series.
    return pd.Series(joined, dtype=object)


def load_genome_table(table_path):
    """Load the genomeID / contigIDs table with one row per contig ID."""
    # dtype=str keeps IDs such as "100.10" verbatim; letting pandas infer
    # the type would parse them as floats and change the output filename.
    df = pd.read_table(
        table_path,
        header=None,
        names=['genomeID', 'contigIDs'],
        dtype=str,
    )
    return df.assign(contigIDs=df['contigIDs'].str.split(',')).explode('contigIDs')


def write_genome_fastas(df):
    """Write one ``<genomeID>.fasta`` file per genome in the working directory."""
    for genome_id, group in df.groupby('genomeID'):
        with open(f"{genome_id}.fasta", 'w') as fp:
            for contig_id, seq in zip(group['contigIDs'], group['Seq']):
                fp.write(f">{contig_id}\n{seq}\n")


def main(fasta_path='SAMPLE.fasta', table_path='data.tsv'):
    """Extract the listed contigs of every genome into per-genome FASTA files."""
    # STEP-1: load sample data and create a Series
    sr = load_sequences(fasta_path)
    # STEP-2: load the list of genome id and create a DataFrame
    df = load_genome_table(table_path)
    # STEP-3: map the sequences onto the table; unknown contigs are dropped
    df = df.assign(Seq=df['contigIDs'].map(sr)).dropna()
    # STEP-4: create the files
    write_genome_fastas(df)


if __name__ == '__main__':
    main()
Output:
# Content of 424182.1.fasta
>H|S1|C933685
GAAAGTTCTTGACCTGTGGACAGGCTGTGAATCGGGTTGGACAAGT
# Content of 1217675.1.fasta
>H|S1|C85072
GGAAACGGCTGCTGCCATCCTTGCCCTTCGCCCAAG
>H|S1|C965427
CTCAAGAAATTCGGTATCACCGGTAACTATGAGGCAGTCGAGGTCG