Overview

昨天跟Chris讨论SVM分类预测准确性的时候,知道PSSM_AC的作用比PSSM作用更明显,于是决定将以前的python脚本改进一下,输出PSSMPSSM_AC这两个文件,方便观察。该脚本包括两部分,本文将按顺序记录下来。
以前的脚本可以参考我之前的文章蛋白质序列特征提取方法之——PSSM

1. t34pssm.py

#! /usr/bin/env python
import fileinput
import sys
from os import listdir
from os.path import isfile, join
import re
from pssm_ft import *


smplfasta = sys.argv[1]  
spfasta = sys.argv[2]   
check_head = re.compile(r'\>')  

#read from undersample fasta, store 
smplist = []
smpcnt = 0
for line, strin in enumerate(fileinput.input(smplfasta)):
    if check_head.match(strin):
        smplist.append(strin.strip())
        smpcnt += 1
onlyfiles = [ f for f in listdir(spfasta) if isfile(join(spfasta,f)) ]

fastaDict = {}

for fi in onlyfiles:
    cntnt = ''
    for line, strin in enumerate(fileinput.input(spfasta+'/'+fi)):
        if line == 0:
            cntnt += strin.strip()
        if cntnt in fastaDict:
            print strin
    fastaDict[cntnt] = fi

    pssmdir = sys.argv[3]
    finalist = []
    for smp in smplist:
        finalist.append(pssmdir+'/'+fastaDict[smp].split('.')[0]+'.pssm')

    for fi in finalist:
        pssm(fi, 'total_train_60.pssm','total_train_60.pssm_AC') #此处多了一个参数,多了一个输出文件

2. pssm_ft.py

import sys
import numpy as np
import math
import re
import fileinput 

GROUP = 10

def scale(x, max, min):
"""
scale x to [-1,1]
"""
    try:
        return -(1-(x-min)/(max-min))+((x-min)/(max-min))
    except Exception:
        print "ZeroDivisionError, max-min:\s", max-min

def pssm_ac_cal(PSSM, g, l):
"""
:param PSSM: ['1','-1'...]
:param g: group
:param l: sequence length
:return PSSM_AC: g * 20 matrix
"""
    PSSM_AC = np.array([ [0.0] * 20 ] * g)

    for pg in range(g):
        l_g = l - pg - 1
        for pj in range(20):
            sum_jl = 0.0
            for i in range(l):
                sum_jl += PSSM[i][pj]
            sum_jl /= l
        
            pssm_acjg = 0.0
            for i in range(l_g):
                pssm_acjg += (PSSM[i][pj]-sum_jl) * (PSSM[i+pg+1][pj]-sum_jl)
            pssm_acjg /= l_g
            PSSM_AC[pg][pj] = pssm_acjg
    return PSSM_AC

def pssm(fi,output,output_AC):
    # 0-19 represents amino acid 'ARNDCQEGHILKMFPSTWYV'
    Amino_vec = "ARNDCQEGHILKMFPSTWYV"
    PSSM = []
    ACC = [ [0.0] * 20 ] * 20
    seq_cn = 0
    for line, strin in enumerate(fileinput.input(fi)):
        if line > 2:
            str_vec = strin.split()[1:22]
            if len(str_vec) == 0:
                break
            PSSM.append(map(int, str_vec[1:]))
            seq_cn += 1
            #test1_1 += float(str_vec[1]) test1_1 = -103
            ACC[Amino_vec.index(str_vec[0])] = map(sum, zip(map(int, str_vec[1:]), 
                ACC[Amino_vec.index(str_vec[0])]))  
    fileinput.close()
    # section for ACC features 
    ACC_np = np.array(ACC)
    ACC_np = np.divide(ACC_np, seq_cn)

    amcnt_max = np.amax(ACC_np)
    amcnt_min = np.amin(ACC_np)

    vfunc = np.vectorize(scale)
    ACC_np = vfunc(ACC_np, amcnt_max,amcnt_min)
    ACC_shp = np.shape(ACC_np)

    #section for PSSM_AC features
    PSSM = np.array(PSSM)
    PSSM_AC = pssm_ac_cal(PSSM, GROUP, seq_cn)
    PSSM_shp = np.shape(PSSM_AC)

    file_out = file(output,'a')
    file_out_AC = file(output_AC,'a')
    #此处将保存文件的代码拆分开
    np.savetxt(file_out, [np.reshape(ACC_np, (ACC_shp[0] * ACC_shp[1], ))], delimiter=",")
    np.savetxt(file_out_AC, [np.reshape(PSSM_AC, (PSSM_shp[0] * PSSM_shp[1], ))], delimiter=",")