{
 "nbformat": 4,
 "nbformat_minor": 0,
 "metadata": { "colab": { "name": "Kmer_estimates.ipynb", "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } },
 "cells": [
  { "cell_type": "code",
    "source": [
      "from google.colab import drive\n",
      "drive.mount('/content/drive')"
    ],
    "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "dh_zq3usUmdR", "outputId": "ef58d2da-f89b-4934-912f-4aa99f0006be" },
    "execution_count": 4,
    "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Mounted at /content/drive\n" ] } ] },
  { "cell_type": "code",
    "metadata": { "id": "sBajg6MXF9xk" },
    "source": [
      "import os\n",
      "import re"
    ],
    "execution_count": 3,
    "outputs": [] },
  { "cell_type": "code",
    "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "v1s4oLVAH4TS", "outputId": "f597750f-07d9-4e5f-fa44-08dcf2d8b262" },
    "source": [
      "# Root folder with one sub-folder per sample; each sub-folder is expected to\n",
      "# hold \"<sample>_scaffolds.fasta\".\n",
      "directory = \"/content/drive/MyDrive/AMR/Data/Scaffold_fasta/\"\n",
      "\n",
      "# k-mer size used for the estimate (31-mers).\n",
      "K = 31\n",
      "\n",
      "# Scaffold headers are expected to embed the sequence length as \"length_<n>_cov\".\n",
      "HEADER_LENGTH = re.compile(r\"length_(\\d+)_cov\")\n",
      "\n",
      "\n",
      "def scaffold_lengths(fasta_path):\n",
      "    \"\"\"Return the length of every scaffold listed in a scaffolds FASTA file.\n",
      "\n",
      "    Only the '>' header lines are inspected; each length is read from the\n",
      "    'length_<n>_cov' field of the header rather than by counting bases.\n",
      "\n",
      "    Raises:\n",
      "        ValueError: if a header line has no 'length_<n>_cov' field.\n",
      "    \"\"\"\n",
      "    lengths = []\n",
      "    with open(fasta_path, \"r\") as fasta:\n",
      "        for line in fasta:\n",
      "            if not line.startswith(\">\"):\n",
      "                continue  # sequence line, nothing to read here\n",
      "            match = HEADER_LENGTH.search(line)\n",
      "            if match is None:\n",
      "                raise ValueError(\n",
      "                    \"no 'length_<n>_cov' field in header {!r} of {}\".format(line.strip(), fasta_path)\n",
      "                )\n",
      "            lengths.append(int(match.group(1)))\n",
      "    return lengths\n",
      "\n",
      "\n",
      "# Result matrix: row 0 = sample name, row 1 = number of scaffold sequences,\n",
      "# row 2 = upper bound on the number of K-mers in the sample.\n",
      "final = [[], [], []]\n",
      "for filename in os.listdir(directory):\n",
      "    file_path = os.path.join(directory, filename, filename + \"_scaffolds.fasta\")\n",
      "    if not os.path.isfile(file_path):\n",
      "        # Skip stray entries (e.g. notebook checkpoints) instead of crashing in open().\n",
      "        print(\"Skipping {!r}: {} not found\".format(filename, file_path))\n",
      "        continue\n",
      "    lengths = scaffold_lengths(file_path)\n",
      "    final[0].append(filename)\n",
      "    final[1].append(len(lengths))\n",
      "    # A sequence of length L holds at most L - (K - 1) K-mers. Applying this to\n",
      "    # the summed length treats the scaffolds as one contiguous sequence, so the\n",
      "    # value is an upper bound; clamp at 0 for empty or very short assemblies.\n",
      "    final[2].append(max(sum(lengths) - (K - 1), 0))\n",
      "print(final)"
    ],
    "execution_count": 6,
    "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "[['SRR8737536', 'SRR8737538', 'SRR8737590', 'SRR8737593', 'SRR8737589', 'SRR8737594', 'SRR8737595', 'SRR8737588', 'SRR8737591', 'SRR8737537', 'SRR8737544', 'SRR8737548', 'SRR8737539', 'SRR8737542', 'SRR8737549', 'SRR8737547', 'SRR8737545', 'SRR8737541', 'SRR8737543', 'SRR8737546', 'SRR8737558', 'SRR8737559', 'SRR8737555', 'SRR8737556', 'SRR8737551', 'SRR8737550', 'SRR8737552', 'SRR8737557', 'SRR8737554', 'SRR8737553', 'SRR8737569', 'SRR8737565', 'SRR8737560', 'SRR8737564', 'SRR8737566', 'SRR8737561', 'SRR8737567', 'SRR8737568', 'SRR8737562', 'SRR8737563', 'SRR8737577', 'SRR8737570', 'SRR8737575', 'SRR8737573', 'SRR8737576', 'SRR8737579', 'SRR8737578', 'SRR8737572', 'SRR8737574', 'SRR8737571', 'SRR8737583', 'SRR8737582', 'SRR8737584', 'SRR8737587', 'SRR8737580', 'SRR8737581', 'SRR8737585', 'SRR8737586'], [105, 281, 179, 85, 187, 224, 96, 61, 191, 253, 93, 125, 128, 194, 181, 67, 51, 170, 681, 49, 178, 115, 1289, 230, 368, 65, 123, 117, 182, 547, 79, 134, 
176, 66, 54, 221, 145, 137, 75, 101, 327, 148, 143, 88, 135, 250, 246, 152, 179, 181, 152, 163, 58, 131, 276, 111, 66, 123], [6383506, 7031416, 6639898, 6702035, 6794394, 7047165, 6677481, 6299582, 6803703, 6798108, 6656349, 6824094, 6737718, 7283196, 7116969, 6318507, 6689528, 6903885, 7296571, 6669503, 6808141, 6890166, 7430776, 7079287, 7240576, 6772607, 6335341, 6974358, 7058500, 7123755, 6957354, 6524397, 6933804, 6558003, 6531491, 6783762, 6821803, 6944308, 6358044, 6434451, 6896369, 6932158, 6913986, 6362473, 6792614, 7021255, 7063224, 6691150, 6913933, 6728952, 6363440, 6302582, 6676803, 6905426, 6974226, 6730539, 6425546, 6406556]]\n" ] } ] }, { "cell_type": "markdown", "metadata": { "id": "42rGdY6NXGLp" }, "source": [ "Thus the first file (SRR8737536) contains 105 scaffold sequences with a total of 6,383,536 bases, giving at most 6,383,536 - (31 - 1) = 6,383,506 31-mers. This is an upper bound: a k-mer cannot span two scaffolds, so the exact number of 31-mer positions is slightly lower." ] } ] }