1. Construct ML tree
# 使用去除郑州果蔬所和两个异常位点的数据(总计 682)
# bfile -> fasta -> nwk
## qwen
#!/bin/bash
# Script to build Maximum Likelihood tree from PLINK files with IQTree
# Uses data with outgroups (Mira_RUN_OUT_3) for phylogenetic analysis
set -e # Exit immediately if a command exits with a non-zero status
# Define variables
DATADIR="/home/the_sun/Analysis_for_kib/Mira_Analysis_Genetic_Diversity/data"
INPUT_PREFIX="Mira_RUN_OUT_3"
OUTPUT_PREFIX="mira_ml_tree"
LOG_FILE="${OUTPUT_PREFIX}_log.txt"
GROUP_FILE="Group_INFO_MiraRenew_2_OUT.csv"
echo "Starting ML tree construction pipeline..." | tee "${LOG_FILE}"
# Check if input files exist
if [ ! -f "${DATADIR}/${INPUT_PREFIX}.bed" ] || [ ! -f "${DATADIR}/${INPUT_PREFIX}.bim" ] || [ ! -f "${DATADIR}/${INPUT_PREFIX}.fam" ]; then
echo "Error: Input PLINK files (${INPUT_PREFIX}.{bed,bim,fam}) not found!" | tee -a "${LOG_FILE}"
exit 1
fi
echo "Input files found. Starting conversion to FASTA format..." | tee -a "${LOG_FILE}"
# Convert PLINK to VCF first using PLINK (using full path)
echo "Converting PLINK to VCF format..." | tee -a "${LOG_FILE}"
/home/the_sun/miniconda3/bin/plink --bfile "${DATADIR}/${INPUT_PREFIX}" \
--recode vcf \
--out "${DATADIR}/${OUTPUT_PREFIX}_temp" \
--allow-extra-chr
# Convert VCF to FASTA using a Python script
# First, create a simple Python script to convert VCF to FASTA
cat > "${DATADIR}/vcf_to_fasta.py" << 'EOF'
#!/usr/bin/env python3
import sys
import gzip
def vcf_to_fasta(vcf_file, output_fasta):
# Dictionary to store sequences for each sample
sequences = {}
sample_names = []
# Open VCF file (handle both gzipped and regular files)
if vcf_file.endswith('.gz'):
f = gzip.open(vcf_file, 'rt')
else:
f = open(vcf_file, 'r')
for line in f:
line = line.strip()
# Skip header lines that don't start with sample info
if line.startswith("#CHROM"):
# Get sample names from the header
fields = line.split('\t')
sample_names = fields[9:] # Sample names start from column 10
# Initialize empty sequence for each sample
for sample in sample_names:
sequences[sample] = ""
elif not line.startswith('#') and len(line) > 0:
# Process data lines
fields = line.split('\t')
ref = fields[3]
alt = fields[4]
# Process genotype data for each sample (columns 10 onwards)
for i, sample_data in enumerate(fields[9:]):
if i < len(sample_names): # Safety check
# Extract genotype from VCF format (e.g., "0/1:38,25:63:99:784,0,1424")
gt_info = sample_data.split(':')
genotype = gt_info[0] # Get the genotype part (e.g., "0/1")
# Convert genotype to nucleotide
if genotype == "0/0" or genotype == "0|0":
base = ref
elif genotype == "1/1" or genotype == "1|1":
base = alt
elif genotype == "0/1" or genotype == "1/0" or genotype == "0|1" or genotype == "1|0":
base = "N" # Heterozygous sites represented as N
elif genotype.startswith("0"):
base = ref
elif genotype.startswith("1"):
base = alt
else:
base = "N" # Unknown
# Append this base to the sequence for this sample
sequences[sample_names[i]] += base
f.close()
# Write sequences to FASTA file
with open(output_fasta, 'w') as fasta_out:
for sample in sample_names:
if sample in sequences:
# Write sequence in FASTA format (with 80-character line wrapping)
fasta_out.write(f">{sample}\n")
seq = sequences[sample]
# Wrap lines at 80 characters
for i in range(0, len(seq), 80):
fasta_out.write(seq[i:i+80] + "\n")
if __name__ == "__main__":
if len(sys.argv) != 3:
print("Usage: python vcf_to_fasta.py <input.vcf> <output.fasta>")
sys.exit(1)
vcf_file = sys.argv[1]
output_fasta = sys.argv[2]
vcf_to_fasta(vcf_file, output_fasta)
print(f"Conversion completed: {output_fasta}")
EOF
# Run the conversion script
echo "Converting VCF to FASTA format..." | tee -a "${LOG_FILE}"
python3 "${DATADIR}/vcf_to_fasta.py" "${DATADIR}/${OUTPUT_PREFIX}_temp.vcf" "${DATADIR}/${OUTPUT_PREFIX}.fasta"
# Check if FASTA conversion was successful
if [ ! -f "${DATADIR}/${OUTPUT_PREFIX}.fasta" ]; then
echo "Error: FASTA conversion failed!" | tee -a "${LOG_FILE}"
exit 1
fi
echo "FASTA file created successfully with $(grep -c '^>' ${DATADIR}/${OUTPUT_PREFIX}.fasta) sequences." | tee -a "${LOG_FILE}"
# Now activate IQTree environment and run the analysis
echo "Activating IQTree environment..." | tee -a "${LOG_FILE}"
source /home/the_sun/miniconda3/bin/activate iqtree_env
# Run IQTree to build ML tree
echo "Starting IQTree analysis..." | tee -a "${LOG_FILE}"
# Use IQTree with appropriate parameters for SNP data
# We'll use ModelFinder to automatically select the best substitution model
# For quick testing, use SH-like approximate likelihood ratio test (fast)
iqtree -s "${DATADIR}/${OUTPUT_PREFIX}.fasta" \
-m MFP \
-alrt 1000 \
-nt AUTO \
-pre "${DATADIR}/${OUTPUT_PREFIX}_iqtree" \
-seed 12345
echo "IQTree analysis completed. Checking results..." | tee -a "${LOG_FILE}"
# Check if IQTree completed successfully
if [ -f "${DATADIR}/${OUTPUT_PREFIX}_iqtree.treefile" ]; then
echo "ML tree successfully created: ${OUTPUT_PREFIX}_iqtree.treefile" | tee -a "${LOG_FILE}"
# Create a summary of the results
echo "=== ANALYSIS SUMMARY ===" | tee -a "${LOG_FILE}"
echo "Input: ${INPUT_PREFIX} with outgroups" | tee -a "${LOG_FILE}"
echo "Number of sequences: $(grep -c '^>' ${DATADIR}/${OUTPUT_PREFIX}.fasta)" | tee -a "${LOG_FILE}"
echo "Alignment length: $(awk 'NR==2 {print length($0)}' ${DATADIR}/${OUTPUT_PREFIX}.fasta)" | tee -a "${LOG_FILE}"
echo "Output tree file: ${OUTPUT_PREFIX}_iqtree.treefile" | tee -a "${LOG_FILE}"
echo "Tree format: Newick" | tee -a "${LOG_FILE}"
echo "Support method: Ultrafast bootstrap (100 replicates for testing, use 1000 for final)" | tee -a "${LOG_FILE}"
echo "=== ANALYSIS COMPLETE ===" | tee -a "${LOG_FILE}"
# Clean up temporary files
echo "Cleaning up temporary files..." | tee -a "${LOG_FILE}"
rm -f "${DATADIR}/${OUTPUT_PREFIX}_temp.vcf" "${DATADIR}/vcf_to_fasta.py"
echo "Pipeline completed successfully!" | tee -a "${LOG_FILE}"
else
echo "Error: IQTree did not complete successfully!" | tee -a "${LOG_FILE}"
exit 1
fi
echo "Script finished at $(date)" | tee -a "${LOG_FILE}"
2. Structure
# bfile -> 继续过滤 -> structure数据
# iflow
#!/bin/bash
# 设置路径
INPUT_DIR="/home/the_sun/Analysis_for_kib/Mira_Analysis_Genetic_Diversity/data"
OUTPUT_DIR="/home/the_sun/Analysis_for_kib/Mira_Analysis_Genetic_Diversity/data"
STRUCTURE_PATH="/media/the_sun/KIB/Tools/structure/console/structure"
# 进一步过滤SNP,减少到几千个用于structure分析
plink2 \
--bfile ${INPUT_DIR}/Mira_RUN_PURE_3 \
--geno 0.2 \
--maf 0.05 \
--indep-pairwise 20 2 0.05 \
--out ${OUTPUT_DIR}/Mira_structure_filtered
# 提取过滤后的SNP
plink2 \
--bfile ${INPUT_DIR}/Mira_RUN_PURE_3 \
--extract ${OUTPUT_DIR}/Mira_structure_filtered.prune.in \
--make-bed \
--out ${OUTPUT_DIR}/Mira_structure_input
# 将bed文件转换为structure输入格式
plink \
--bfile ${OUTPUT_DIR}/Mira_structure_input \
--recode \
--out ${OUTPUT_DIR}/Mira_structure_input
# 运行structure分析
# 统计过滤后的SNP数量
echo "Filtered SNP count:"
wc -l ${OUTPUT_DIR}/Mira_structure_filtered.prune.in
# 检查生成的structure输入文件
ls -la ${OUTPUT_DIR}/Mira_structure_input.ped
# 运行structure分析
${STRUCTURE_PATH} -m ${OUTPUT_DIR}/mainparams -e ${OUTPUT_DIR}/extraparams -i ${OUTPUT_DIR}/Mira_structure_input.ped