Genetics Book Data Scraper
import scrapy
class QuotesSpider(scrapy.Spider):
    """Scrape the ``brugada.molgen.TA`` table from NCBI Bookshelf page NBK1517.

    Every ``<tr>`` found under the ``__brugada.molgen.TA_lrgtbl__`` div is
    emitted as one item holding the text and link of the first five cells.
    """

    name = "quotes"
    start_urls = [
        'https://www.ncbi.nlm.nih.gov/books/NBK1517/#brugada.molgen.TA',
    ]

    def parse(self, response):
        """Yield one dict per table row.

        Args:
            response: The downloaded Bookshelf page.

        Yields:
            dict: Keys ``gene``, ``gene_link``, ``chromosome``,
            ``chromosome_link``, ``protein``, ``protein_link``, ``locous``,
            ``hgdm`` and ``hgdm_link``. A value is ``None`` when the matching
            cell has no anchor.
        """
        table = response.xpath('//div[@id="__brugada.molgen.TA_lrgtbl__"]')
        # NOTE(review): the path requires a literal <tbody>; confirm the raw
        # HTML served to Scrapy contains one (browsers may insert it).
        for row in table.xpath('table/tbody/tr'):
            gene_href = row.xpath('td[1]/a/@href').extract_first()
            # Resolve the gene href against the page URL. A missing anchor
            # stays None instead of becoming the bogus "...govNone" URL that
            # string concatenation with str(None) used to produce.
            if gene_href is not None:
                gene_link = response.urljoin(gene_href)
            else:
                gene_link = None

            yield {
                'gene': row.xpath('td[1]/a/i/text()').extract_first(),
                'gene_link': gene_link,
                'chromosome': row.xpath('td[2]/a/text()').extract_first(),
                'chromosome_link': row.xpath('td[2]/a/@href').extract_first(),
                'protein': row.xpath('td[3]/a/text()').extract_first(),
                'protein_link': row.xpath('td[3]/a/@href').extract_first(),
                # Key spelling kept as-is: downstream consumers may rely on it.
                'locous': row.xpath('td[4]/a/text()').extract_first(),
                'hgdm': row.xpath('td[5]/a/text()').extract_first(),
                'hgdm_link': row.xpath('td[5]/a/@href').extract_first(),
            }
Brugada Pubmed
import scrapy
from scrapy.spiders import XMLFeedSpider
# Module-level list that Brugadapubmed.parse appends every collected Id string to.
# NOTE(review): this name shadows the builtin `id` for the rest of the module.
id = []
class Brugadapubmed(XMLFeedSpider):
    """Fetch PubMed records matching the "brugada syndrome review" search.

    The start URL is an E-utilities ``esearch`` query. ``parse`` collects the
    returned ``<Id>`` values and issues a single ``efetch`` request for them;
    ``parse_page2`` turns each ``<PubmedArticle>`` of that reply into an item.

    NOTE(review): depending on the Scrapy version, ``XMLFeedSpider`` may route
    responses through its own feed handling instead of ``parse`` -- verify
    this callback is actually invoked, or whether a plain ``scrapy.Spider``
    base was intended.
    """

    name = "brugadapubmed"
    start_urls = [
        'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=brugada+syndrome+review&usehistory=y&retmax=657',
    ]

    def parse(self, response):
        """Build the efetch request for every Id in the esearch response.

        Args:
            response: The esearch XML response.

        Returns:
            scrapy.Request | None: A GET request for the efetch URL, handled
            by ``parse_page2``; ``None`` when the search returned no Id.
        """
        pmids = response.selector.xpath('.//Id/text()').extract()
        if not pmids:
            # Without this guard an efetch call with an empty id list is sent.
            self.logger.warning('No Id elements found in %s', response.url)
            return None

        # Keep the module-level accumulator in sync, as before.
        id.extend(pmids)

        # NOTE(review): NCBI's E-utilities guidance suggests HTTP POST for
        # large UID lists; with retmax=657 this GET URL gets long -- confirm.
        url = (
            "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
            + 'db=pubmed'
            + '&id=' + ','.join(pmids)
            + '&retmode=xml'
        )
        return scrapy.Request(url, callback=self.parse_page2, method='GET')

    def parse_page2(self, response):
        """Yield one item per ``<PubmedArticle>`` in the efetch response.

        Args:
            response: The efetch XML response.

        Yields:
            dict: Keys ``Article Id``, ``Article Title``, ``Author`` and
            ``Abstract``, each a list of the matching text nodes.
        """
        self.logger.debug('In parse method')
        for article in response.selector.xpath('.//PubmedArticle'):
            yield {
                'Article Id': article.xpath('.//PMID/text()').extract(),
                'Article Title': article.xpath('.//Article//ArticleTitle/text()').extract(),
                # AuthorList sits under Article, not inside ArticleTitle, so
                # the previous path through ArticleTitle could never match.
                'Author': article.xpath('.//Article//AuthorList//Author//ForeName/text()').extract(),
                'Abstract': article.xpath('.//Article//Abstract//AbstractText/text()').extract(),
            }
NCBI Fasta Work
import random

import xlrd
from Bio import SeqIO
from Bio.Alphabet import generic_protein
from Bio.Blast import NCBIWWW
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
# Script: apply one randomly chosen substitution from the spreadsheet to the
# sequence in scn5a.fasta, save the result, and submit it to NCBI BLAST.

# Mutation strings read from the spreadsheet (filled below).
mut = []

# Load the sequence as a mutable copy. Each iteration overwrites m_seq, so
# only the last record of the file is kept.
for seq_record in SeqIO.parse("scn5a.fasta", "fasta"):
    m_seq = seq_record.seq.tomutable()

# Collect the value at column index 2 of every row of the first worksheet.
sh = xlrd.open_workbook('snp_export').sheet_by_index(0)
mut.extend(sh.cell(rownum, 2).value for rownum in range(sh.nrows))

# Pick one mutation at random.
a = random.choice(mut)

# NOTE(review): presumably a[0] is the reference residue, a[1:3] a two-digit
# position and a[4] the replacement residue -- confirm against the real
# contents of the spreadsheet column.
# NOTE(review): the position is used as a 0-based index; confirm whether the
# spreadsheet uses 1-based numbering.
# Sequence indices must be integers, so the position slice is converted;
# indexing with the raw string slice raised TypeError.
x = int(a[1:3])
if m_seq[x] == a[0]:
    m_seq[x] = a[4]
print(m_seq[125])

# Write the (possibly) mutated sequence under the original id/description.
rec = SeqRecord(
    Seq(str(m_seq), generic_protein),
    id=seq_record.id,
    description=seq_record.description,
)
SeqIO.write(rec, "mutated_scn5a.fasta", "fasta")

# Read the file back with a context manager so the handle is closed.
with open("mutated_scn5a.fasta") as fasta_file:
    fasta_string = fasta_file.read()

result_handle = NCBIWWW.qblast("tblastn", "nt", fasta_string)
# Print the BLAST output itself rather than the handle object's repr.
print(result_handle.read())