{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "Kmer_estimates.ipynb",
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "source": [
        "from google.colab import drive\n",
        "drive.mount('/content/drive')"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "dh_zq3usUmdR",
        "outputId": "ef58d2da-f89b-4934-912f-4aa99f0006be"
      },
      "execution_count": 4,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Mounted at /content/drive\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "sBajg6MXF9xk"
      },
      "source": [
        "import os"
      ],
      "execution_count": 3,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "v1s4oLVAH4TS",
        "outputId": "f597750f-07d9-4e5f-fa44-08dcf2d8b262"
      },
      "source": [
        "#Directory to the data\n",
        "directory = \"/content/drive/MyDrive/AMR/Data/Scaffold_fasta/\"\n",
        "\n",
        "#Final matrix which will contain the file name in first row,\n",
        "#number of seperate scaffold fasta sequences in second row,\n",
        "#and maximum number of k-mers in third row\n",
        "final =  [[],[],[]]\n",
        "#itterate through every file in the directory\n",
        "for filename in os.listdir(directory):\n",
        "  file_path = directory + filename + \"/\" + filename + \"_scaffolds.fasta\"     #Obtain the full path of each file\n",
        "  lengths = []                                  #Initialize empty vector of sequence lengths\n",
        "#As each file contains multiple scaffold sequences as opposed to a single genome sequence\n",
        "#We will need to obtain the length of each sequence in the file\n",
        "#Then the number of entries, or len(lengths) will be the number of scaffold sequences in the file\n",
        "#While the sum will be the total length.                                                \n",
        "  with open(file_path,\"r\") as file:             #Open the file and itterate line by line\n",
        "    for line in file:\n",
        "      if line[0]=='>':                          #If the file contains a \">\", that means it is a line\n",
        "                                                #which contains the length of the following sequence\n",
        "        startindex = line.find(\"length_\") + 7   #Find the position where the length starts\n",
        "        stopindex = line.find(\"_cov\")           #Find the position where the length ends\n",
        "        length = int(line[startindex:stopindex])#Convert the length to an integer\n",
        "        lengths.append(length)                  #Add length to vector of sequence lengths\n",
        "  #print(len(lengths)) #Lines used during testing of code commented out\n",
        "  #print(sum(lengths))\n",
        "  #append the needed values to the final matrix\n",
        "  final[0].append(filename)\n",
        "  final[1].append(len(lengths))\n",
        "  final[2].append(sum(lengths)-30) #the minus 30 give the number of 31-mers as\n",
        "                                   #The maximum # of k-mers is length-(k-1)\n",
        "print(final)"
      ],
      "execution_count": 6,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "[['SRR8737536', 'SRR8737538', 'SRR8737590', 'SRR8737593', 'SRR8737589', 'SRR8737594', 'SRR8737595', 'SRR8737588', 'SRR8737591', 'SRR8737537', 'SRR8737544', 'SRR8737548', 'SRR8737539', 'SRR8737542', 'SRR8737549', 'SRR8737547', 'SRR8737545', 'SRR8737541', 'SRR8737543', 'SRR8737546', 'SRR8737558', 'SRR8737559', 'SRR8737555', 'SRR8737556', 'SRR8737551', 'SRR8737550', 'SRR8737552', 'SRR8737557', 'SRR8737554', 'SRR8737553', 'SRR8737569', 'SRR8737565', 'SRR8737560', 'SRR8737564', 'SRR8737566', 'SRR8737561', 'SRR8737567', 'SRR8737568', 'SRR8737562', 'SRR8737563', 'SRR8737577', 'SRR8737570', 'SRR8737575', 'SRR8737573', 'SRR8737576', 'SRR8737579', 'SRR8737578', 'SRR8737572', 'SRR8737574', 'SRR8737571', 'SRR8737583', 'SRR8737582', 'SRR8737584', 'SRR8737587', 'SRR8737580', 'SRR8737581', 'SRR8737585', 'SRR8737586'], [105, 281, 179, 85, 187, 224, 96, 61, 191, 253, 93, 125, 128, 194, 181, 67, 51, 170, 681, 49, 178, 115, 1289, 230, 368, 65, 123, 117, 182, 547, 79, 134, 176, 66, 54, 221, 145, 137, 75, 101, 327, 148, 143, 88, 135, 250, 246, 152, 179, 181, 152, 163, 58, 131, 276, 111, 66, 123], [6383506, 7031416, 6639898, 6702035, 6794394, 7047165, 6677481, 6299582, 6803703, 6798108, 6656349, 6824094, 6737718, 7283196, 7116969, 6318507, 6689528, 6903885, 7296571, 6669503, 6808141, 6890166, 7430776, 7079287, 7240576, 6772607, 6335341, 6974358, 7058500, 7123755, 6957354, 6524397, 6933804, 6558003, 6531491, 6783762, 6821803, 6944308, 6358044, 6434451, 6896369, 6932158, 6913986, 6362473, 6792614, 7021255, 7063224, 6691150, 6913933, 6728952, 6363440, 6302582, 6676803, 6905426, 6974226, 6730539, 6425546, 6406556]]\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "42rGdY6NXGLp"
      },
      "source": [
        "Thus we have 105 reads in first file with a total of 6,383,536 monomers. 6,383,536-(31-1) = 6,383,506 31-mers"
      ]
    }
  ]
}