251029_Genetic_Diversity - 给刘琦的365封信

1. Construct ML tree

# 使用去除郑州果蔬所和两个异常位点的数据（总计 682）
# bfile -> fasta -> nwk

##  qwen

#!/bin/bash

# Script to build Maximum Likelihood tree from PLINK files with IQTree
# Uses data with outgroups (Mira_RUN_OUT_3) for phylogenetic analysis

set -e  # Exit immediately if a command exits with a non-zero status

# Define variables
DATADIR="/home/the_sun/Analysis_for_kib/Mira_Analysis_Genetic_Diversity/data"
INPUT_PREFIX="Mira_RUN_OUT_3"
OUTPUT_PREFIX="mira_ml_tree"
LOG_FILE="${OUTPUT_PREFIX}_log.txt"
GROUP_FILE="Group_INFO_MiraRenew_2_OUT.csv"

echo "Starting ML tree construction pipeline..." | tee "${LOG_FILE}"

# Check if input files exist
if [ ! -f "${DATADIR}/${INPUT_PREFIX}.bed" ] || [ ! -f "${DATADIR}/${INPUT_PREFIX}.bim" ] || [ ! -f "${DATADIR}/${INPUT_PREFIX}.fam" ]; then
    echo "Error: Input PLINK files (${INPUT_PREFIX}.{bed,bim,fam}) not found!" | tee -a "${LOG_FILE}"
    exit 1
fi

echo "Input files found. Starting conversion to FASTA format..." | tee -a "${LOG_FILE}"

# Convert PLINK to VCF first using PLINK (using full path)
echo "Converting PLINK to VCF format..." | tee -a "${LOG_FILE}"
/home/the_sun/miniconda3/bin/plink --bfile "${DATADIR}/${INPUT_PREFIX}" \
      --recode vcf \
      --out "${DATADIR}/${OUTPUT_PREFIX}_temp" \
      --allow-extra-chr

# Convert VCF to FASTA using a Python script
# First, create a simple Python script to convert VCF to FASTA
cat > "${DATADIR}/vcf_to_fasta.py" << 'EOF'
#!/usr/bin/env python3
import sys
import gzip

def vcf_to_fasta(vcf_file, output_fasta):
    # Dictionary to store sequences for each sample
    sequences = {}
    sample_names = []
    
    # Open VCF file (handle both gzipped and regular files)
    if vcf_file.endswith('.gz'):
        f = gzip.open(vcf_file, 'rt')
    else:
        f = open(vcf_file, 'r')
    
    for line in f:
        line = line.strip()
        
        # Skip header lines that don't start with sample info
        if line.startswith("#CHROM"):
            # Get sample names from the header
            fields = line.split('\t')
            sample_names = fields[9:]  # Sample names start from column 10
            # Initialize empty sequence for each sample
            for sample in sample_names:
                sequences[sample] = ""
        
        elif not line.startswith('#') and len(line) > 0:
            # Process data lines
            fields = line.split('\t')
            ref = fields[3]
            alt = fields[4]
            
            # Process genotype data for each sample (columns 10 onwards)
            for i, sample_data in enumerate(fields[9:]):
                if i < len(sample_names):  # Safety check
                    # Extract genotype from VCF format (e.g., "0/1:38,25:63:99:784,0,1424")
                    gt_info = sample_data.split(':')
                    genotype = gt_info[0]  # Get the genotype part (e.g., "0/1")
                    
                    # Convert genotype to nucleotide
                    if genotype == "0/0" or genotype == "0|0":
                        base = ref
                    elif genotype == "1/1" or genotype == "1|1":
                        base = alt
                    elif genotype == "0/1" or genotype == "1/0" or genotype == "0|1" or genotype == "1|0":
                        base = "N"  # Heterozygous sites represented as N
                    elif genotype.startswith("0"):
                        base = ref
                    elif genotype.startswith("1"):
                        base = alt
                    else:
                        base = "N"  # Unknown
                    
                    # Append this base to the sequence for this sample
                    sequences[sample_names[i]] += base
    
    f.close()
    
    # Write sequences to FASTA file
    with open(output_fasta, 'w') as fasta_out:
        for sample in sample_names:
            if sample in sequences:
                # Write sequence in FASTA format (with 80-character line wrapping)
                fasta_out.write(f">{sample}\n")
                seq = sequences[sample]
                # Wrap lines at 80 characters
                for i in range(0, len(seq), 80):
                    fasta_out.write(seq[i:i+80] + "\n")

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python vcf_to_fasta.py <input.vcf> <output.fasta>")
        sys.exit(1)
    
    vcf_file = sys.argv[1]
    output_fasta = sys.argv[2]
    
    vcf_to_fasta(vcf_file, output_fasta)
    print(f"Conversion completed: {output_fasta}")
EOF

# Run the conversion script
echo "Converting VCF to FASTA format..." | tee -a "${LOG_FILE}"
python3 "${DATADIR}/vcf_to_fasta.py" "${DATADIR}/${OUTPUT_PREFIX}_temp.vcf" "${DATADIR}/${OUTPUT_PREFIX}.fasta"

# Check if FASTA conversion was successful
if [ ! -f "${DATADIR}/${OUTPUT_PREFIX}.fasta" ]; then
    echo "Error: FASTA conversion failed!" | tee -a "${LOG_FILE}"
    exit 1
fi

echo "FASTA file created successfully with $(grep -c '^>' ${DATADIR}/${OUTPUT_PREFIX}.fasta) sequences." | tee -a "${LOG_FILE}"

# Now activate IQTree environment and run the analysis
echo "Activating IQTree environment..." | tee -a "${LOG_FILE}"
source /home/the_sun/miniconda3/bin/activate iqtree_env

# Run IQTree to build ML tree
echo "Starting IQTree analysis..." | tee -a "${LOG_FILE}"

# Use IQTree with appropriate parameters for SNP data
# We'll use ModelFinder to automatically select the best substitution model
# For quick testing, use SH-like approximate likelihood ratio test (fast)
iqtree -s "${DATADIR}/${OUTPUT_PREFIX}.fasta" \
       -m MFP \
       -alrt 1000 \
       -nt AUTO \
       -pre "${DATADIR}/${OUTPUT_PREFIX}_iqtree" \
       -seed 12345

echo "IQTree analysis completed. Checking results..." | tee -a "${LOG_FILE}"

# Check if IQTree completed successfully
if [ -f "${DATADIR}/${OUTPUT_PREFIX}_iqtree.treefile" ]; then
    echo "ML tree successfully created: ${OUTPUT_PREFIX}_iqtree.treefile" | tee -a "${LOG_FILE}"
    
    # Create a summary of the results
    echo "=== ANALYSIS SUMMARY ===" | tee -a "${LOG_FILE}"
    echo "Input: ${INPUT_PREFIX} with outgroups" | tee -a "${LOG_FILE}"
    echo "Number of sequences: $(grep -c '^>' ${DATADIR}/${OUTPUT_PREFIX}.fasta)" | tee -a "${LOG_FILE}"
    echo "Alignment length: $(awk 'NR==2 {print length($0)}' ${DATADIR}/${OUTPUT_PREFIX}.fasta)" | tee -a "${LOG_FILE}"
    echo "Output tree file: ${OUTPUT_PREFIX}_iqtree.treefile" | tee -a "${LOG_FILE}"
    echo "Tree format: Newick" | tee -a "${LOG_FILE}"
    echo "Support method: Ultrafast bootstrap (100 replicates for testing, use 1000 for final)" | tee -a "${LOG_FILE}"
    
    echo "=== ANALYSIS COMPLETE ===" | tee -a "${LOG_FILE}"
    
    # Clean up temporary files
    echo "Cleaning up temporary files..." | tee -a "${LOG_FILE}"
    rm -f "${DATADIR}/${OUTPUT_PREFIX}_temp.vcf" "${DATADIR}/vcf_to_fasta.py"
    
    echo "Pipeline completed successfully!" | tee -a "${LOG_FILE}"
else
    echo "Error: IQTree did not complete successfully!" | tee -a "${LOG_FILE}"
    exit 1
fi

echo "Script finished at $(date)" | tee -a "${LOG_FILE}"

2. Structure

# bfile -> 继续过滤 -> structure数据

# iflow
#!/bin/bash

# 设置路径
INPUT_DIR="/home/the_sun/Analysis_for_kib/Mira_Analysis_Genetic_Diversity/data"
OUTPUT_DIR="/home/the_sun/Analysis_for_kib/Mira_Analysis_Genetic_Diversity/data"
STRUCTURE_PATH="/media/the_sun/KIB/Tools/structure/console/structure"

# 进一步过滤SNP，减少到几千个用于structure分析
plink2 \
    --bfile ${INPUT_DIR}/Mira_RUN_PURE_3 \
    --geno 0.2 \
    --maf 0.05 \
    --indep-pairwise 20 2 0.05 \
    --out ${OUTPUT_DIR}/Mira_structure_filtered

# 提取过滤后的SNP
plink2 \
    --bfile ${INPUT_DIR}/Mira_RUN_PURE_3 \
    --extract ${OUTPUT_DIR}/Mira_structure_filtered.prune.in \
    --make-bed \
    --out ${OUTPUT_DIR}/Mira_structure_input

# 将bed文件转换为structure输入格式
plink \
    --bfile ${OUTPUT_DIR}/Mira_structure_input \
    --recode \
    --out ${OUTPUT_DIR}/Mira_structure_input

# 运行structure分析
# 统计过滤后的SNP数量
echo "Filtered SNP count:"
wc -l ${OUTPUT_DIR}/Mira_structure_filtered.prune.in

# 检查生成的structure输入文件
ls -la ${OUTPUT_DIR}/Mira_structure_input.ped

# 运行structure分析
${STRUCTURE_PATH} -m ${OUTPUT_DIR}/mainparams -e ${OUTPUT_DIR}/extraparams -i ${OUTPUT_DIR}/Mira_structure_input.ped

1. Construct ML tree

2. Structure

评论