#!/usr/bin/perl use strict; use warnings; use Getopt::Long; use LWP::Simple; use XML::Simple; use Bio::Seq; use Bio::SeqIO; my ( $genome, $coordinate, $fi, $fo, $help ); GetOptions( "g|genome=s" => \$genome, "c|coordinate=s" => \$coordinate, "i|input=s" => \$fi, "o|output=s" => \$fo, "h|help" => \$help ); if ($help) { print STDOUT ; exit(0); } $genome = $genome ? $genome : "hg19"; my $mode; if ( ($coordinate) && ( !$fi ) ) { print STDERR "Using single mode...\n"; #1 for single mode $mode = 1; } if ( ( !$coordinate ) && ($fi) ) { print STDERR "Using batch mode...\n"; #0 for batch mode $mode = 0; } if ( ( !$coordinate ) && ( !$fi ) ) { print STDOUT ; exit(0); } if ( ($coordinate) && ($fi) ) { print STDERR "You can use single OR batch mode, but not both!\n"; exit(0); } my $out; #Save results to a file, or print them to the STDOUT if ($fo) { $out = Bio::SeqIO->new( -file => ">$fo", -format => "Fasta" ); } else { $out = Bio::SeqIO->new( -fh => \*STDOUT, -format => "Fasta" ); } if ($mode) { #Single mode if ( $coordinate =~ /(chr.+):(\d+)-(\d+)/ ) { my $record = &process( $genome, $1, $2, $3 ); $out->write_seq($record); } else { print STDOUT ; exit(0); } } else { #Batch mode open my $IN, '<', $fi or die "$0 : failed to open input file '$fi' : $!\n"; while (<$IN>) { chomp; my ( $chr_in, $start_in, $stop_in, @null ) = split /\t/; my $record = &process( $genome, $chr_in, $start_in, $stop_in ); $out->write_seq($record); } close $IN or warn "$0 : failed to close input file '$fi' : $!\n"; } sub process { my ( $g, $c, $s, $e ) = @_; #Get the XML results using DAS from UCSC my $url = "http://genome.ucsc.edu/cgi-bin/das/$g/dna?segment=$c:$s,$e"; my $page = get($url); #Parse the XML results my $xml = new XML::Simple; my $data = $xml->XMLin("$page"); my $chr = $data->{SEQUENCE}->{id}; my $start = $data->{SEQUENCE}->{start}; my $stop = $data->{SEQUENCE}->{stop}; my $length = $data->{SEQUENCE}->{DNA}->{length}; my $seq = uc( $data->{SEQUENCE}->{DNA}->{content} ); $seq =~ s/\n//g; my $id = "$chr:$start-$stop"; my $desc = "genome=$genome;length=$length"; my $record = Bio::Seq->new( -display_id => "$id", -desc => "$desc", -seq => "$seq", -moltype => "dna" ); return ($record); } __DATA__ Program: fetchGenomeSequence.pl (v20110514) Author: Yixf (xfyin@sibs.ac.cn) Summary: Fetch subsequence(s) from a specified genome according to coordinate(s). Usage: intersectGene.pl [OPTIONS] Options: -g, --genome Specify a genome.[default: hg19] -c, --coordinate The coordinate of a region in "chr1:10-20" format. -i, --input The input file containing many coordinates. -o, --output The output file to save FASTA result(s). -h, --help Print this help message. Notes: 1. You can use -c (coordinate for single mode) OR -i (input file for batch mode), but not both at the same time. 2. Unless you use -o, the results will be printed to the STDOUT[terminal]. 3. If you use -i, please pay attention to: 1) The delimiter in the file MUST be TAB. 2) The file must have 3 columns at least. 3) The first 3 columns MUST be "Chromosome,Start,Stop". 4) Columns after the third column will be ignored. 5) An example: "chr1\t10\t20". 4. The coordinate uses 1-based system. (For more information, please refer to: http://yixf.name/2011/03/26/%E5%9F%BA%E5%9B%A0%E7%BB%84%E7%9A%84%E5%9D%90%E6%A0%87%E7%B3%BB%E7%BB%9F%EF%BC%9A0-based%E4%B8%8E1-based/)