diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..823b334 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +falmeida_py.egg-info/ +dist +falmeida_py/__pycache__ +build/* +docs/_build +PHD_FILES +test* \ No newline at end of file diff --git a/.gitpod.yml b/.gitpod.yml new file mode 100644 index 0000000..a565aba --- /dev/null +++ b/.gitpod.yml @@ -0,0 +1,14 @@ +image: nfcore/gitpod:latest + +vscode: + extensions: # based on nf-core.nf-core-extensionpack + - codezombiech.gitignore # Language support for .gitignore files + # - cssho.vscode-svgviewer # SVG viewer + - esbenp.prettier-vscode # Markdown/CommonMark linting and style checking for Visual Studio Code + - eamodio.gitlens # Quickly glimpse into whom, why, and when a line or code block was changed + - EditorConfig.EditorConfig # override user/workspace settings with settings found in .editorconfig files + - Gruntfuggly.todo-tree # Display TODO and FIXME in a tree view in the activity bar + - mechatroner.rainbow-csv # Highlight columns in csv files in different colors + # - nextflow.nextflow # Nextflow syntax highlighting + - oderwat.indent-rainbow # Highlight indentation level + - streetsidesoftware.code-spell-checker # Spelling checker for source code \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..12ff2fd --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "restructuredtext.confPath": "${workspaceFolder}/docs" +} \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..70413f1 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,23 @@ +# Changelog + +Started only in version 0.5 + +## v1.2.3 + +- Added ICEberg and PHAST results to bacannot json summary file + +## v1.2.2 + +- Added a file size check, only enter bacannot summary creation if results files are not empty +- Fixed mob_suite summary parsing (added .astype(str)) +- Moved MGE dict key generation for inside the integron_finder loop + +## v1.2.1 + +- Added a quick fix for plasmid finder results parsing, as results from gram-negative have only one database, but for gram-positive it has >1. + +## v0.5 + +### hotfix + +A small fix has been performed, the blast databases now in the scripts are created without the `-parse_seqids` option because it was bringing problem in the subset module (`align2subsetgbk`). \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..f288702 --- /dev/null +++ b/LICENSE @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + Copyright (C) + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +. diff --git a/README.md b/README.md new file mode 100644 index 0000000..d1c3553 --- /dev/null +++ b/README.md @@ -0,0 +1,41 @@ +# falmeida-py (Felipe Almeida python scripts) + +[![](https://anaconda.org/falmeida/falmeida-py/badges/installer/conda.svg)](https://anaconda.org/falmeida/falmeida-py) +[![](https://anaconda.org/falmeida/falmeida-py/badges/version.svg)](https://anaconda.org/falmeida/falmeida-py) +[![](https://anaconda.org/falmeida/falmeida-py/badges/platforms.svg)](https://anaconda.org/falmeida/falmeida-py) +[![Documentation Status](https://readthedocs.org/projects/falmeida-py/badge/?version=latest)](https://falmeida-py.readthedocs.io/en/latest/?badge=latest) + +This repository has been turned into an installable python package in order to facilitate the distribution of my custom python scripts and, in turn, make them easier to execute. + +Read the complete documentation » + +## Installation + +Installation is super easy and perhaps not required: + +```bash +# Download +git clone https://github.com/fmalmeida/pythonScripts.git +cd pythonScripts + +# Run without installing +python3 falmeida-py-runner.py -h + +# Or install with conda and run anywhere +conda install -c anaconda -c conda-forge -c bioconda -c falmeida falmeida-py + +# check installation +falmeida-py -h +``` + +## Old scripts + +All my python scripts as single scripts, that may or may not be included in the package are available in the `bkp` folder! + +## License + +This repository has no warranty and is free to use, modify and share under GNU GENERAL PUBLIC LICENSE version 3. + +## Contact + +Felipe Almeida diff --git a/Readme.md b/Readme.md deleted file mode 100644 index f9bf18d..0000000 --- a/Readme.md +++ /dev/null @@ -1,15 +0,0 @@ -# About - -This repository was created with the only reason to store my Python Scripts that shall eventually be used more than once. - -Feel free to comment some if you believe it can be improved also, feel free to use. - -# Contact - -## Name - -Felipe Marques de Almeida - -## Email - -falmeida@aluno.unb.br diff --git a/TEs_vs_shiu-pipeline.py b/bkp/TEs_vs_shiu-pipeline.py similarity index 100% rename from TEs_vs_shiu-pipeline.py rename to bkp/TEs_vs_shiu-pipeline.py diff --git a/bkp/ask_mongo.py b/bkp/ask_mongo.py new file mode 100644 index 0000000..a9782f9 --- /dev/null +++ b/bkp/ask_mongo.py @@ -0,0 +1,184 @@ +#!/usr/bin/env python3 +# coding: utf-8 + +## Def help message +""" +Ask mongo: This command allows you to ask/interrogate the mongo dbs in your machine + +usage: + ask-mongo.py [ -h|--help ] + ask-mongo.py [ --dbpath ] [ --list_dbs ] + ask-mongo.py [ --dbpath ] [ --db --list_collections ] + ask-mongo.py [ --dbpath ] [ --db --collection ] [ --overview ] [ --subfield --key --val ] + +options: + + -h, --help Show this screen + --dbpath= Where you have saved your dbs? Data directory of your mongo dbs. [Default: /data/db] + --list_dbs List available mongo dbs in your system + --db= Mongo DB to be queryed + --list_collections Check the available collections in a given database + --collection= Collection name (from mongo db) to be queryed + --overview Documents in a given collection (from a mongo db) + --subfield= Is your key/val inside a subfield (a nested document)? Give the name. + --key= Query a collection for documents based on which key? + --val= What to search in this key? + +example: +""" + +################################## +### Loading Necessary Packages ### +################################## +import sys +import os +import re +from docopt import docopt +import urllib.request, urllib.parse, urllib.error +import json +import pymongo +from pymongo import MongoClient +import pathlib +import pprint +import random +from bson.objectid import ObjectId + +######################## +### Useful functions ### +######################## +def start_mongod(db_path): + + try: + # Function to start mongo shell if not started yet + os.system(f"mongod --dbpath {db_path} --syslog --fork &> /dev/null") + except: + pass + +######################### +### Check your mongos ### +######################### +def check_mongos(): + + # Create connection + client = MongoClient() + + # Get dbs + dbs = client.list_database_names() + + # Print databases + print(f"\nThe available mondo dbs found in your system are: {dbs}\n") + + # Close client + client.close() + +####################################### +### Check collections in a database ### +####################################### +def check_db(db_name): + + # Create connection + client = MongoClient() + + # Open Database + db = client[db_name] + + # Check available collections + cols = db.list_collection_names() + print(f"\nAll the available collections found in the {db_name} database are given in the list: {cols}\n") + +############################# +### Overview a collection ### +############################# +def collection_overview(db_name, collection_name): + + # Create connection + client = MongoClient() + + # Open Database + db = client[db_name] + + # Open collection + collection = db[collection_name] + + # Count + n_docs = collection.count_documents({}) + + # Get keys + keys = [] + for doc in collection.find({}): + for key in iter(doc.keys()): + keys.append(key) + keys = list(dict.fromkeys(keys)) # remove duplicates + + # Get one as example + ids = [] + for doc in collection.find({}): + ids.append(doc['_id']) + example = collection.find_one({'_id': ObjectId(random.choice(ids))}) + + # Give overview + print(f"When analysing the collection {collection_name} in the database {db_name} we have found a total of {n_docs} documents (entries).") + print(f"These are the searcheable keys found in these documents: {keys}.") + print(f"They are searcheable and parseable based on its values.") + print(f"Here it is an example of one document of the collection:\n") + pprint.pprint(example) + +#################### +### Get Document ### +#################### +def get_doc(db_name, collection_name, key, val, subfield): + + # Create connection + client = MongoClient() + + # Open Database + db = client[db_name] + + # Open collection + collection = db[collection_name] + + # Find my document + if key == '_id': + results = collection.find({ key: ObjectId(val) }) + elif subfield != None: + results = collection.find({ f"{subfield}.{key}" : val }) + else: + results = collection.find({ key: val }) + + # Print + for doc in results: + pprint.pprint(doc) + +################ +### Def main ### +################ +if __name__ == '__main__': + args = docopt(__doc__, version='v1.0 by Felipe Marques de Almeida') + + # Start db (if not started) + start_mongod(db_path=args['--dbpath']) + + ## Help + if args['--help']: + print(args.strip()) + + ## List dbs + elif args['--list_dbs']: + check_mongos() + + ## List collections + elif args['--list_collections'] and args['--db']: + check_db(db_name=args['--db']) + + ## Overview collection + elif args['--collection'] and args['--db']: + if args['--overview']: + collection_overview(db_name=args['--db'], collection_name=args['--collection']) + elif args['--key'] and args['--val']: + get_doc(db_name=args['--db'], collection_name=args['--collection'], key=args['--key'], val=args['--val'], + subfield=args['--subfield']) + + ## None + else: + print("Missing argument!\n") + print(__doc__.strip()) diff --git a/detect_genes_with_pfam.py b/bkp/detect_genes_with_pfam.py similarity index 100% rename from detect_genes_with_pfam.py rename to bkp/detect_genes_with_pfam.py diff --git a/fetch_genomes_entrez.py b/bkp/fetch_genomes_entrez.py old mode 100755 new mode 100644 similarity index 100% rename from fetch_genomes_entrez.py rename to bkp/fetch_genomes_entrez.py diff --git a/filter_gff.py b/bkp/filter_gff.py similarity index 100% rename from filter_gff.py rename to bkp/filter_gff.py diff --git a/gff2json.py b/bkp/gff2json.py old mode 100755 new mode 100644 similarity index 100% rename from gff2json.py rename to bkp/gff2json.py diff --git a/make_UpSetR_from_tsv.py b/bkp/make_UpSetR_from_tsv.py old mode 100755 new mode 100644 similarity index 100% rename from make_UpSetR_from_tsv.py rename to bkp/make_UpSetR_from_tsv.py diff --git a/mongoDB_parse_JSON.py b/bkp/mongoDB_parse_JSON.py old mode 100755 new mode 100644 similarity index 100% rename from mongoDB_parse_JSON.py rename to bkp/mongoDB_parse_JSON.py diff --git a/parse_cardDB.py b/bkp/parse_cardDB.py old mode 100755 new mode 100644 similarity index 100% rename from parse_cardDB.py rename to bkp/parse_cardDB.py diff --git a/parse_resfamsDB.py b/bkp/parse_resfamsDB.py old mode 100755 new mode 100644 similarity index 100% rename from parse_resfamsDB.py rename to bkp/parse_resfamsDB.py diff --git a/plot_dna_features.py b/bkp/plot_dna_features.py similarity index 100% rename from plot_dna_features.py rename to bkp/plot_dna_features.py diff --git a/resources/Kp31_card.tsv b/bkp/resources/Kp31_card.tsv old mode 100755 new mode 100644 similarity index 100% rename from resources/Kp31_card.tsv rename to bkp/resources/Kp31_card.tsv diff --git a/resources/Kp31_resistance.tsv b/bkp/resources/Kp31_resistance.tsv old mode 100755 new mode 100644 similarity index 100% rename from resources/Kp31_resistance.tsv rename to bkp/resources/Kp31_resistance.tsv diff --git a/resources/Resfams_metadata.xlsx b/bkp/resources/Resfams_metadata.xlsx similarity index 100% rename from resources/Resfams_metadata.xlsx rename to bkp/resources/Resfams_metadata.xlsx diff --git a/resources/aro_categories_index.tsv b/bkp/resources/aro_categories_index.tsv similarity index 100% rename from resources/aro_categories_index.tsv rename to bkp/resources/aro_categories_index.tsv diff --git a/resources/aro_index.tsv b/bkp/resources/aro_index.tsv similarity index 100% rename from resources/aro_index.tsv rename to bkp/resources/aro_index.tsv diff --git a/rgi2gff.py b/bkp/rgi2gff.py old mode 100755 new mode 100644 similarity index 100% rename from rgi2gff.py rename to bkp/rgi2gff.py diff --git a/run_blasts.py b/bkp/run_blasts.py similarity index 100% rename from run_blasts.py rename to bkp/run_blasts.py diff --git a/bkp/splitgbk2fasta.py b/bkp/splitgbk2fasta.py new file mode 100644 index 0000000..aa94ec0 --- /dev/null +++ b/bkp/splitgbk2fasta.py @@ -0,0 +1,121 @@ +#!/usr/bin/env python3 +# coding: utf-8 + +######################## +### Def help message ### +######################## +""" +A script meant to subset a genbank annotation file based on alignments +against a query FASTA file + +--- +Copyright (C) 2020 Felipe Marques de Almeida (almeidafmarques@gmail.com) +License: Public Domain + +Usage: + splitgbk2fasta.py + splitgbk2fasta.py -h|--help + splitgbk2fasta.py -v|--version + splitgbk2fasta.py ( --gbk --fasta --out ) [ --minid --mincov --culling_limit ] + +Options: + -h --help Show this screen. + -v --version Show version information + -g --gbk= Gbk file for subset + -f --fasta= FASTA file for querying the gbk + -o --out= Gbk filtered output file + --minid= Min. Identity percentage for gene annotation [default: 80] + --mincov= Min. Covereage for gene annotation [default: 80] + --culling_limit= Blast culling_limit for best hit only [default: 1] +""" + +################################## +### Loading Necessary Packages ### +################################## +from docopt import docopt +from Bio import SeqIO +from io import StringIO +import pandas as pd +import os +import sys + +########################################## +### Function to convert gbk into fasta ### +########################################## +def gbk2fasta(gbk): + f=open("tmp_gbk.fa", "a") + for seq_record in SeqIO.parse(gbk, 'genbank'): + for seq_feature in seq_record.features: + if seq_feature.type=="CDS": + if 'translation' in seq_feature.qualifiers: + print(f">{seq_feature.qualifiers['locus_tag'][0]}", file=f) + print(f"{seq_feature.qualifiers['translation'][0]}", file=f) + else: + start = seq_feature.location.nofuzzy_start + end = seq_feature.location.nofuzzy_end + tl_table = seq_feature.qualifiers['transl_table'][0] + if str(seq_feature.strand) == '-1': + my_seq = seq_record.seq[start:end].reverse_complement().translate(table=tl_table) + else: + my_seq = seq_record.seq[start:end].translate(table=tl_table) + seq_feature.qualifiers['translation'] = str(my_seq) + print(f">{seq_feature.qualifiers['locus_tag'][0]}", file=f) + print(f"{seq_feature.qualifiers['translation'][0]}", file=f) + +############################################################## +### Function to run blast and detect locus_tags that match ### +############################################################## +def blastgbk(fasta, culling, minid, mincov): + # Outfmt + outfmt="6 qseqid qstart qend qlen sseqid sstart send slen evalue length pident gaps gapopen stitle" + + # Run blast + os.system(f"makeblastdb -dbtype nucl -in {fasta} -out query_db") + os.system(f"tblastn -db query_db -query tmp_gbk.fa -outfmt \"{outfmt}\" -culling_limit {culling} | \ + awk -v minid={minid} -v mincov={mincov} '{{ if ($11 >= minid && (($10 - $12) / $4 * 100) >= mincov) {{print $0}} }}' > out.blast") + +############################################ +### Function to filter gbk based on hits ### +############################################ +def filtergbk(gbk, out): + + f=open(f"{out}", "w") + # Read blast results + blast_res = pd.read_csv('out.blast', sep = '\t', header=None) + + # Get locus_tags + sel_locus = blast_res.iloc[:, 0].tolist() + + # Subset + for seq_record in SeqIO.parse(gbk, 'genbank'): + + cds = [feat for feat in seq_record.features if feat.type == 'CDS'] + filtered = [ft for ft in cds if ft.qualifiers['locus_tag'][0] in sel_locus] + seq_record.features = [i for i in filtered] + if len(seq_record.features) > 0: + SeqIO.write(seq_record, f, 'gb') + +############ +### Main ### +############ +if __name__ == '__main__': + arguments = docopt(__doc__, version='v1.0 by Felipe Marques de Almeida') + + ## Run pipeline + if arguments['--gbk'] and arguments['--fasta']: + + # Run + gbk2fasta(gbk=arguments['--gbk']) + blastgbk(fasta=arguments['--fasta'], culling=arguments['--culling_limit'], + minid=arguments['--minid'], mincov=arguments['--mincov']) + filtergbk(gbk=arguments['--gbk'], out=arguments['--out']) + + # Clean dir + os.system(f"rm tmp_gbk.fa out.blast query_db.n*") + + ## None + else: + print("Missing mandatory arguments") + print("Please, check out the help message") + print("") + print(arguments) diff --git a/splitgenbank.py b/bkp/splitgenbank.py similarity index 100% rename from splitgenbank.py rename to bkp/splitgenbank.py diff --git a/teste.json b/bkp/teste.json similarity index 100% rename from teste.json rename to bkp/teste.json diff --git a/tgfam_vs_shiu-pipeline.py b/bkp/tgfam_vs_shiu-pipeline.py similarity index 100% rename from tgfam_vs_shiu-pipeline.py rename to bkp/tgfam_vs_shiu-pipeline.py diff --git a/build_conda.sh b/build_conda.sh new file mode 100644 index 0000000..d3d0571 --- /dev/null +++ b/build_conda.sh @@ -0,0 +1,19 @@ +# make dir +mkdir -p build + +# build conda +conda build --output-folder build/ conda.recipe + +# convert to osx +conda convert -p osx-64 $(find build -name "falmeida-py*.tar.bz2") + +# upload osx +anaconda upload $(find osx-64 -name "falmeida-py*.tar.bz2") --force +sleep 140 +anaconda upload $(find build/linux-64 -name "falmeida-py*.tar.bz2") --force + +# rm dirs +rm -rf build osx-64 + +# save new help message +bash build_helps.sh diff --git a/build_helps.sh b/build_helps.sh new file mode 100644 index 0000000..e4f14bf --- /dev/null +++ b/build_helps.sh @@ -0,0 +1,5 @@ +# save new help message +python3 falmeida-py-runner.py -h &> docs/help_message.txt +python3 falmeida-py-runner.py tsv2markdown -h &> docs/tsv2markdown_help.txt +python3 falmeida-py-runner.py splitgbk -h &> docs/splitgbk_help.txt +python3 falmeida-py-runner.py align2subsetgbk -h &> docs/align2subsetgbk_help.txt \ No newline at end of file diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml new file mode 100644 index 0000000..b1e862a --- /dev/null +++ b/conda.recipe/meta.yaml @@ -0,0 +1,46 @@ +package: + name: falmeida-py + version: '1.2.4' + +source: + path: .. + +build: + number: 0 + script: pip install . + entry_points: + - processrcmfolder = uconnrcmpy.dataprocessing:process_folder + + +requirements: + build: + - python>=3.8 + - setuptools + - setuptools-git + - pandas + - tabulate + - docopt + - biopython + - simplejson + - importlib_metadata + - pyyaml + + run: + - python>=3.8 + - setuptools + - setuptools-git + - pandas + - tabulate + - docopt + - biopython + - simplejson + - importlib_metadata + - pyyaml + +channels: + - anaconda + - conda-forge + - bioconda + +about: + home: https://github.com/fmalmeida/gff-toolbox diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..d4bb2cb --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/_static/style.css b/docs/_static/style.css new file mode 100644 index 0000000..a3df43e --- /dev/null +++ b/docs/_static/style.css @@ -0,0 +1,6 @@ +.wy-nav-content { + max-width: 100% !important; +} +p { + text-align:justify; +} diff --git a/docs/align2subsetgbk.rst b/docs/align2subsetgbk.rst new file mode 100644 index 0000000..06b0cac --- /dev/null +++ b/docs/align2subsetgbk.rst @@ -0,0 +1,45 @@ +.. _align2subsetgbk: + +align2subsetgbk +=============== + +This script is a little more tricky. What does it do? + +1. It takes a genbank file and a (Nucleotide) fasta file +2. It will align the fasta to the (contig) sequences in the genbank file +3. Then, it will filter the alignments based on the given thresholds and take the its coordinates +4. Finally, using these coordinates it will filter the genbank file and output only a subset of the annotation features that is found inside these alignments + +CLI help message +---------------- + +.. literalinclude:: ./align2subsetgbk_help.txt + :language: stdout + +Usage +----- + +The usage is super simple: + +.. code-block:: none + + falmeida-py align2subsetgbk \ + --gbk sample.gbk \ + --fasta genomic_islands.fna \ + --extension 50 \ + --culling_limit 2 + + Processing file: annotation/sample.gbk! + qseqid qstart qend qlen sseqid sstart send slen evalue length pident gaps gapopen bitscore + 0 NC_016845.1_1287077_1326610 1 39533 39533 NC_016845.1 1287078 1326610 5333942 0.0 39533 100.0 0 0 73004 + 1 NC_016845.1_1778390_1811349 1 32959 32959 NC_016845.1 1778391 1811349 5333942 0.0 32959 100.0 0 0 60864 + 2 NC_016845.1_2286181_2305760 1 19579 19579 NC_016845.1 2286182 2305760 5333942 0.0 19579 100.0 0 0 36156 + 3 NC_016845.1_4049987_4083106 1 33119 33119 NC_016845.1 4049988 4083106 5333942 0.0 33119 100.0 0 0 61160 + 4 NC_016845.1_4821157_4858652 1 37495 37495 NC_016845.1 4821158 4858652 5333942 0.0 37495 100.0 0 0 69241 + + +.. note:: + + This command will align the genomic islands to the genbank to find the annotation features that are found inside the regions that align to these genomic islands. + + In the example, the alignments coordinates will be "extendend" 50nt in both directions and, the two best alignments will be used for each sequence (from fasta). \ No newline at end of file diff --git a/docs/align2subsetgbk_help.txt b/docs/align2subsetgbk_help.txt new file mode 100644 index 0000000..769c2b2 --- /dev/null +++ b/docs/align2subsetgbk_help.txt @@ -0,0 +1,44 @@ +A script meant to subset a genbank annotation file based on alignments against a query (Nucleotide) FASTA file + +--- +Copyright (C) 2020 Felipe Marques de Almeida (almeidafmarques@gmail.com) +License: Public Domain + +Usage: + falmeida-py align2subsetgbk [ -h|--help ] + falmeida-py align2subsetgbk [ --gbk --fasta --out --minid --mincov --culling_limit --extension ] + +Options: + -h --help Show this screen. + -g --gbk= Gbk file for subset + -f --fasta= FASTA (nucl) file for querying the gbk + -o --out= Gbk filtered output file [Default: out.gbk]. + --extension= Base pair length to extend the flank regions in the alignment [Default: 0]. + --minid= Min. Identity percentage for gene annotation [Default: 80]. + --mincov= Min. Covereage for gene annotation [Default: 80]. + --culling_limit= Blast culling_limit for best hit only [Default: 1]. +falmeida-py: a package to the simple distribution of my custom scripts. + +Copyright (C) 2020 Felipe Marques de Almeida (almeidafmarques@gmail.com) +License: Public Domain + +usage: + falmeida-py [ -h|--help ] [ -v|--version ] [ --license ] + falmeida-py [ -h|--help ] [ ... ] + +options: + -h --help Show this screen + -v --version Show version information + --license Show LEGAL LICENSE information + +commands: + tsv2markdown Command for rapid convertion of tsv or csv to markdown tables. + splitgbk Command to split multisequence genbank files into individual files. + align2subsetgbk Command to subset genbank files based on alignments to a FASTA file. + gbk2fasta Command to convert genbank files to fasta files. + blasts Command to execute automatized blast commands. + replace_fasta_seq Command to replace strings in a FASTA using defitinitions from a BED file + mpgap2csv Command to summarize main mpgap multiqc assembly statistics into a CSV file + bacannot2json Command to summarize main bacannot annotation results into JSON file + +Use: `falmeida-py -h` to get more help and see examples. diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..52a2469 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,115 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + + +# -- Project information ----------------------------------------------------- + +project = 'falmeida-py' +copyright = '2020, Felipe, Almeida. falmeida-py: A simple package to index my frequently used python scripts.' +author = 'Felipe Marques de Almeida' + + +# -- General configuration --------------------------------------------------- +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The master toctree document. +master_doc = 'index' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + "sphinx.ext.intersphinx", + "sphinx.ext.autodoc", + "sphinx.ext.mathjax", + "sphinx.ext.viewcode", + "sphinx_copybutton" +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# + +# --- default --- +#import sphinx_rtd_theme +#html_theme = 'sphinx_rtd_theme' + +# --- material --- +#import sphinx_materialdesign_theme +#html_theme = "sphinx_materialdesign_theme" + +# --- jupyter --- +#import jupyter_sphinx_theme +#html_theme = "jupyter" +#html_sidebars = {'**': ['sidebartoc.html']} +#html_theme_path = jupyter_sphinx_theme.get_html_theme_path() + +# --- karma --- +#html_theme = "karma_sphinx_theme" + +# --- material --- +html_theme = "sphinx_material" +# Material theme options (see theme.conf for more information) +html_theme_options = { + + # Set the color and the accent color + # Primary color. Options are red, pink, purple, deep-purple, indigo, blue, light-blue, cyan, teal, green, light-green, lime, yellow, amber, orange, deep-orange, brown, grey, blue-grey, and white. + 'color_primary': 'indigo', + #Accent color. Options are red, pink, purple, deep-purple, indigo, blue, light-blue, cyan, teal, green, light-green, lime, yellow, amber, orange, and deep-orange. + 'color_accent': 'amber', + + # Visible levels of the global TOC; -1 means unlimited + 'globaltoc_depth': 1, + # If False, expand all TOC entries + 'globaltoc_collapse': True, + # If True, show hidden TOC entries + 'globaltoc_includehidden': True, + + # logo + 'logo_icon': "book", + + # repo info + "repo_url": "https://github.com/fmalmeida/pythonScripts", + "repo_name": "falmeida-py", + "repo_type": "github", +} +html_sidebars = { + "**": ["globaltoc.html", "localtoc.html", "searchbox.html"] +} + +# --- custom css --- +html_title = project +html_static_path = ['_static'] +html_css_files = ['style.css'] + +# Copy button configuration -------------------------------------------------- +# Source: https://sphinx-copybutton.readthedocs.io/en/latest/ +#copybutton_prompt_text = "$ " diff --git a/docs/help_message.txt b/docs/help_message.txt new file mode 100644 index 0000000..6ed5318 --- /dev/null +++ b/docs/help_message.txt @@ -0,0 +1,25 @@ +falmeida-py: a package to the simple distribution of my custom scripts. + +Copyright (C) 2020 Felipe Marques de Almeida (almeidafmarques@gmail.com) +License: Public Domain + +usage: + falmeida-py [ -h|--help ] [ -v|--version ] [ --license ] + falmeida-py [ -h|--help ] [ ... ] + +options: + -h --help Show this screen + -v --version Show version information + --license Show LEGAL LICENSE information + +commands: + tsv2markdown Command for rapid convertion of tsv or csv to markdown tables. + splitgbk Command to split multisequence genbank files into individual files. + align2subsetgbk Command to subset genbank files based on alignments to a FASTA file. + gbk2fasta Command to convert genbank files to fasta files. + blasts Command to execute automatized blast commands. + replace_fasta_seq Command to replace strings in a FASTA using defitinitions from a BED file + mpgap2csv Command to summarize main mpgap multiqc assembly statistics into a CSV file + bacannot2json Command to summarize main bacannot annotation results into JSON file + +Use: `falmeida-py -h` to get more help and see examples. diff --git a/docs/images/example.svg b/docs/images/example.svg new file mode 100644 index 0000000..09c2b8e --- /dev/null +++ b/docs/images/example.svg @@ -0,0 +1,2841 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..28003ec --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,50 @@ +.. _index: + +falmeida-py (Felipe Almeida python scripts) +=========================================== + +This is just a simple repository of python scripts that I use in a daily basis. Some of them are used in my projects, some are just for fun. Thus, in order to provide a better way to use my scripts throughout machines, I decided to create a package with them. + +Installation +------------ + +Conda package +""""""""""""" + +Users are advised to create separate conda environments + +.. code-block:: bash + + # Get the conda package + mamba create -n falmeida-py -c anaconda -c conda-forge -c bioconda -c falmeida falmeida-py + +.. tip:: + + Users are advised to use `mamba `_ since it is faster. + +Available features +------------------ + +Available features can be visualised with the command line help message: + +.. literalinclude:: ./help_message.txt + :language: stdout + + +.. note:: + + All the commands will have a page describing its usage. + +.. toctree:: + :hidden: + :caption: Home + + self + +.. toctree:: + :hidden: + :caption: Reference book + + tsv2markdown + splitgbk + align2subsetgbk diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000..9b03960 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,9 @@ +Sphinx +sphinxcontrib-applehelp +sphinxcontrib-devhelp +sphinxcontrib-htmlhelp +sphinxcontrib-jsmath +sphinxcontrib-qthelp +sphinxcontrib-serializinghtml +sphinx-material +sphinx-copybutton diff --git a/docs/splitgbk.rst b/docs/splitgbk.rst new file mode 100644 index 0000000..c2ba08d --- /dev/null +++ b/docs/splitgbk.rst @@ -0,0 +1,27 @@ +.. _splitgbk: + +splitgbk +======== + +This is a super simple script that performs only one task: + +* Given a genbank file with multiple sequences, it splits it into multiple files, each one with one sequence and its related annotation features. + +CLI help message +---------------- + +.. literalinclude:: ./splitgbk_help.txt + :language: stdout + +Usage +----- + +The usage is super simple: + +.. code-block:: none + + falmeida-py tsv2markdown --gbk multi_contig.gbk --outdir single_contig_gbks + +.. note:: + + The directory must already exist. \ No newline at end of file diff --git a/docs/splitgbk_help.txt b/docs/splitgbk_help.txt new file mode 100644 index 0000000..f139033 --- /dev/null +++ b/docs/splitgbk_help.txt @@ -0,0 +1,38 @@ +A very simple script to split multisequence genbank files into separate files +Copyright (C) 2020 Felipe Marques de Almeida (almeidafmarques@gmail.com) +License: Public Domain + +usage: + falmeida-py splitgbk + falmeida-py splitgbk [ -h|--help ] + falmeida-py splitgbk [ --gbk ] [ --outdir ] + +options: + -h --help Show this screen. + -g --gbk= Input genbank file to split into multiple individual files. + -o --outdir= Directory (must already exist) in which to write the splitted files [Default: ./]. +falmeida-py: a package to the simple distribution of my custom scripts. + +Copyright (C) 2020 Felipe Marques de Almeida (almeidafmarques@gmail.com) +License: Public Domain + +usage: + falmeida-py [ -h|--help ] [ -v|--version ] [ --license ] + falmeida-py [ -h|--help ] [ ... ] + +options: + -h --help Show this screen + -v --version Show version information + --license Show LEGAL LICENSE information + +commands: + tsv2markdown Command for rapid convertion of tsv or csv to markdown tables. + splitgbk Command to split multisequence genbank files into individual files. + align2subsetgbk Command to subset genbank files based on alignments to a FASTA file. + gbk2fasta Command to convert genbank files to fasta files. + blasts Command to execute automatized blast commands. + replace_fasta_seq Command to replace strings in a FASTA using defitinitions from a BED file + mpgap2csv Command to summarize main mpgap multiqc assembly statistics into a CSV file + bacannot2json Command to summarize main bacannot annotation results into JSON file + +Use: `falmeida-py -h` to get more help and see examples. diff --git a/docs/tsv2markdown.rst b/docs/tsv2markdown.rst new file mode 100644 index 0000000..3707ffe --- /dev/null +++ b/docs/tsv2markdown.rst @@ -0,0 +1,39 @@ +.. _tsv2markdown: + +tsv2markdown +============ + +Often I've found myself having to add the contents of TSV (or CSV) files in markdown documents such as jupyter notebooks or rmarkdown documents. The process is very tedious, so I created a super simple script to output the contents so it could just be pasted into these documents. + +CLI help message +---------------- + +.. literalinclude:: ./tsv2markdown_help.txt + :language: stdout + +Usage +----- + +The usage is super simple. For example, given the following CSV: + +.. code-block:: none + + sample1,brazil,river,2020 + sample2,argentina,sewer,2020 + sample3,brazil,sewer,2020 + +We could rapidply create a markdown table entry with: + +.. code-block:: none + + falmeida-py tsv2markdown --csv sample.csv --header "sample id,country,collection source,year" + + | sample id | country | collection source | year | + |-------------|-----------|---------------------|--------| + | sample1 | brazil | river | 2020 | + | sample2 | argentina | sewer | 2020 | + | sample3 | brazil | sewer | 2020 | + +.. note:: + + If the first line of the document (TSV or CSV) is a header, will to not need to use the ``--header`` option, because it will use the frist line as the header. \ No newline at end of file diff --git a/docs/tsv2markdown_help.txt b/docs/tsv2markdown_help.txt new file mode 100644 index 0000000..0b2b685 --- /dev/null +++ b/docs/tsv2markdown_help.txt @@ -0,0 +1,40 @@ +A simple script to convert tsv (or csv) files to markdown tables using tabulate! +Copyright (C) 2020 Felipe Marques de Almeida (almeidafmarques@gmail.com) +License: Public Domain + +usage: + falmeida-py tsv2markdown + falmeida-py tsv2markdown [ -h|--help ] + falmeida-py tsv2markdown [ --tsv --csv --header ] + +options: + -h --help Show this screen. + --tsv= Input tsv file to print as markdown table + --csv= Input csv file to print as markdown table + --header= If file does not have a header, set a + custom header. E.g. --header "Planet,R (km),mass (x 10^29 kg)". +falmeida-py: a package to the simple distribution of my custom scripts. + +Copyright (C) 2020 Felipe Marques de Almeida (almeidafmarques@gmail.com) +License: Public Domain + +usage: + falmeida-py [ -h|--help ] [ -v|--version ] [ --license ] + falmeida-py [ -h|--help ] [ ... ] + +options: + -h --help Show this screen + -v --version Show version information + --license Show LEGAL LICENSE information + +commands: + tsv2markdown Command for rapid convertion of tsv or csv to markdown tables. + splitgbk Command to split multisequence genbank files into individual files. + align2subsetgbk Command to subset genbank files based on alignments to a FASTA file. + gbk2fasta Command to convert genbank files to fasta files. + blasts Command to execute automatized blast commands. + replace_fasta_seq Command to replace strings in a FASTA using defitinitions from a BED file + mpgap2csv Command to summarize main mpgap multiqc assembly statistics into a CSV file + bacannot2json Command to summarize main bacannot annotation results into JSON file + +Use: `falmeida-py -h` to get more help and see examples. diff --git a/falmeida-py-runner.py b/falmeida-py-runner.py new file mode 100755 index 0000000..90dbc71 --- /dev/null +++ b/falmeida-py-runner.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python3 +""" +This script allows a user to run fa-py without a pip/setuptools installation. Assuming all +dependencies are installed, they can simply clone the repo and run this script. + +Copyright 2020 Felipe Almeida (almeidafmarques@gmail.com) +https://github.com/fmalmeida/pythonScripts + +This file is part of my custom python scripts (fa-py) package, which is free: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by the Free Software Foundation, +either version 3 of the License, or (at your option) any later version. This package is distributed +in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +details. You should have received a copy of the GNU General Public License along with fa-py package. +If not, see . +""" + +from falmeida_py.__main__ import main + + +if __name__ == '__main__': + main() diff --git a/falmeida_py/__init__.py b/falmeida_py/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/falmeida_py/__main__.py b/falmeida_py/__main__.py new file mode 100644 index 0000000..4b94bba --- /dev/null +++ b/falmeida_py/__main__.py @@ -0,0 +1,249 @@ +#!/usr/bin/env python3 +license=""" +Copyright 2021 Felipe Almeida (almeidafmarques@gmail.com) +https://github.com/fmalmeida/pythonScripts + +This file is part of my custom python scripts (falmeida-py) package, which is free: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by the Free Software Foundation, +either version 3 of the License, or (at your option) any later version. This package is distributed +in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +details. You should have received a copy of the GNU General Public License along with falmeida-py package. +If not, see . +""" + +## Def main help +usage=""" +falmeida-py: a package to the simple distribution of my custom scripts. + +Copyright (C) 2020 Felipe Marques de Almeida (almeidafmarques@gmail.com) +License: Public Domain + +usage: + falmeida-py [ -h|--help ] [ -v|--version ] [ --license ] + falmeida-py [ -h|--help ] [ ... ] + +options: + -h --help Show this screen + -v --version Show version information + --license Show LEGAL LICENSE information + +commands: + tsv2markdown Command for rapid convertion of tsv or csv to markdown tables. + splitgbk Command to split multisequence genbank files into individual files. + align2subsetgbk Command to subset genbank files based on alignments to a FASTA file. + gbk2fasta Command to convert genbank files to fasta files. + blasts Command to execute automatized blast commands. + replace_fasta_seq Command to replace strings in a FASTA using defitinitions from a BED file + mpgap2csv Command to summarize main mpgap multiqc assembly statistics into a CSV file + bacannot2json Command to summarize main bacannot annotation results into JSON file + +Use: `falmeida-py -h` to get more help and see examples. +""" + +################################## +### Loading Necessary Packages ### +################################## +from docopt import docopt + +######################## +### Import functions ### +######################## +from .version import * +from .tsv2markdown import * +from .splitgbk import * +from .gbk2fasta import * +from .blasts import * +from .align2subsetgbk import * +from .replace_fasta_seq import * +from .bacannot2json import usage_bacannot2json,bacannot2json +from .mpgap2csv import usage_mpgap2csv,mpgap2csv + +## Defining main +def main(): + # Parse docopt + __version__ = get_version() + arguments = docopt(usage, version=__version__, help=False, options_first=True) + + ############################ + ### tsv2markdown command ### + ############################ + if arguments[''] == 'tsv2markdown': + + # Parse docopt + args = docopt(usage_tsv2markdown, version=__version__, help=False) + + # run script + if args['--help']: + print(usage_tsv2markdown.strip()) + + elif args['--tsv']: + file2mw(args['--tsv'], '\t', args['--header']) + + elif args['--csv']: + file2mw(args['--csv'], ',', args['--header']) + + else: + print(usage_tsv2markdown.strip()) + + ######################### + ### Split gbk command ### + ######################### + elif arguments[''] == 'splitgbk': + # Parse docopt + args = docopt(usage_splitgbk, version=__version__, help=False) + + # Run + if args['--help']: + print(usage_splitgbk.strip()) + + elif args['--gbk']: + splitgbk(args['--gbk'], args['--outdir']) + + else: + print(usage_splitgbk.strip()) + + ###################### + ### Blast commands ### + ###################### + elif arguments[''] == 'blasts': + # Parse docopt + args = docopt(usage_blasts, version=__version__, help=False) + + # Run + if args['--help']: + print(usage_blasts.strip()) + + elif args['--query'] and args['--subject']: + + ## check if task is correct + if args['--task'].lower() in ['blastn', 'tblastn', 'blastp', 'blastx']: + ## run blast + blast(task=args['--task'].lower(), query=args['--query'], + subject=args['--subject'], culling=args['--culling_limit'], + minid=args['--minid'], mincov=args['--mincov'], out=args['--out'], + threads=args['--threads'], twoway=args['--2way']) + ## summary + summary(output=args['--out']) + else: + print(f"PROBLEM!\nI could not understand the task \"{args['--task'].lower()}\", please select one of: blastn, tblastn, blastp or blastx.") + + else: + print(usage_blasts.strip()) + + ###################################### + ### Subset gbk with fasta commands ### + ###################################### + elif arguments[''] == 'align2subsetgbk': + # Parse docopt + args = docopt(usage_align2subsetgbk, version=__version__, help=False) + + # Run + if args['--help']: + print(usage_align2subsetgbk.strip()) + + elif args['--gbk'] and args['--fasta']: + + # Run + print(f"Processing file: {args['--gbk']}!") + gbk2fasta(gbk=args['--gbk']) + blast(task='blastn', query=args['--fasta'], subject='tmp_gbk.fa', + culling=args['--culling_limit'], minid=args['--minid'], mincov=args['--mincov'], + out='out.blast', threads=1, twoway=None) + filtergbk(gbk=args['--gbk'], out=args['--out'], extension=int(args['--extension'])) + + # Clean dir + os.system(f"rm -rf tmp_gbk.fa out.blast") + + else: + print(usage_align2subsetgbk.strip()) + + ################################### + ### Replace fasta seq using bed ### + ################################### + elif arguments[''] == 'replace_fasta_seq': + # Parse docopt + args = docopt(usage_replace_fasta_seq, version=__version__, help=False) + + # Run + if args['--help']: + print(usage_replace_fasta_seq.strip()) + + elif args['--fasta'] and args['--bed']: + + # Run + print(f"Processing file: {args['--fasta']}!") + replace_fasta_seq(input=args['--fasta'], bed=args['--bed'], output=args['--out']) + print(f"Done!") + + else: + print(usage_replace_fasta_seq.strip()) + + ########################### + ### Convert gbk 2 fasta ### + ########################### + elif arguments[''] == 'gbk2fasta': + # Parse docopt + args = docopt(usage_gbk2fasta, version=__version__, help=False) + + # run script + if args['--help']: + print(usage_gbk2fasta.strip()) + + elif args['--gbk']: + convertgbk(genbank=args['--gbk'], genes_list=args['--fofn']) + + else: + print(usage_gbk2fasta.strip()) + + ############################# + ### bacannot2json command ### + ############################# + if arguments[''] == 'bacannot2json': + + # Parse docopt + args = docopt(usage_bacannot2json, version=__version__, help=False) + + # run script + if args['--help']: + print(usage_bacannot2json.strip()) + + elif args['--input']: + bacannot2json(args['--input'], args['--output'], args['--print']) + + else: + print(usage_bacannot2json.strip()) + + ######################### + ### mpgap2csv command ### + ######################### + elif arguments[''] == 'mpgap2csv': + + # Parse docopt + args = docopt(usage_mpgap2csv, version=__version__, help=False) + + # run script + if args['--help']: + print(usage_mpgap2csv.strip()) + + elif args['--input']: + mpgap2csv(args['--input'], args['--output']) + + else: + print(usage_mpgap2csv.strip()) + + ##################### + ### Check license ### + ##################### + elif arguments['--license']: + print(license.strip()) + + ####################################### + ### Without commands nor parameters ### + ####################################### + elif not arguments['']: + print(usage.strip()) + +## Calling main +if __name__ == '__main__': + main() diff --git a/falmeida_py/align2subsetgbk.py b/falmeida_py/align2subsetgbk.py new file mode 100644 index 0000000..29fcec8 --- /dev/null +++ b/falmeida_py/align2subsetgbk.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python3 +# coding: utf-8 + +######################## +### Def help message ### +######################## +usage_align2subsetgbk = """ +A script meant to subset a genbank annotation file based on alignments against a query (Nucleotide) FASTA file + +--- +Copyright (C) 2020 Felipe Marques de Almeida (almeidafmarques@gmail.com) +License: Public Domain + +Usage: + falmeida-py align2subsetgbk [ -h|--help ] + falmeida-py align2subsetgbk [ --gbk --fasta --out --minid --mincov --culling_limit --extension ] + +Options: + -h --help Show this screen. + -g --gbk= Gbk file for subset + -f --fasta= FASTA (nucl) file for querying the gbk + -o --out= Gbk filtered output file [Default: out.gbk]. + --extension= Base pair length to extend the flank regions in the alignment [Default: 0]. + --minid= Min. Identity percentage for gene annotation [Default: 80]. + --mincov= Min. Covereage for gene annotation [Default: 80]. + --culling_limit= Blast culling_limit for best hit only [Default: 1]. +""" + +################################## +### Loading Necessary Packages ### +################################## +from docopt import docopt +from Bio import SeqIO +import pandas as pd +from .blasts import * + +########################################## +### Function to convert gbk into fasta ### +########################################## +def gbk2fasta(gbk): + f=open("tmp_gbk.fa", "a") + for seq_record in SeqIO.parse(gbk, 'genbank'): + print(f">{seq_record.id}\n{seq_record.seq}\n", file=f) + +############################################ +### Function to filter gbk based on hits ### +############################################ +def filtergbk(gbk, out, extension): + + f=open(f"{out}", "w") + # Read blast results + blast_res = pd.read_csv('out.blast', sep = '\t') + print(blast_res) + + # Get locus_tags + contigs = sorted(set(blast_res["sseqid"].tolist())) + + # Subset + for contig in contigs: + for seq_record in SeqIO.parse(gbk, 'genbank'): + if str(contig) == str(seq_record.id): + filtered = [] + small_df = blast_res[blast_res['sseqid'].isin([seq_record.id])] + for index, row in small_df.iterrows(): + for features in seq_record.features: + # plus strand + if int(row["sstart"]) <= int(row["send"]): + if int(features.location.start) >= int(row["sstart"] - extension) and int(features.location.start) <= int(row["send"] + extension) and features.type != "source": + filtered.append(features) + # minus strand + elif int(row["sstart"]) >= int(row["send"]): + if int(features.location.start) >= int(row["send"] - extension) and int(features.location.start) <= int(row["sstart"] + extension) and features.type != "source": + filtered.append(features) + + # Print results + seq_record.features = filtered + if len(seq_record.features) > 0: + SeqIO.write(seq_record, f, 'gb') + else: + pass diff --git a/falmeida_py/bacannot2json.py b/falmeida_py/bacannot2json.py new file mode 100644 index 0000000..b4a8b75 --- /dev/null +++ b/falmeida_py/bacannot2json.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python3 +# coding: utf-8 + +######################## +### Def help message ### +######################## +usage_bacannot2json = """ +A script to summarize the main annotation results of fmalmeida/bacannot pipeline as a structured JSON file. + +--- +Copyright (C) 2022 Felipe Marques de Almeida (almeidafmarques@gmail.com) +License: Public Domain + +Usage: + falmeida-py bacannot2json [ -h|--help ] + falmeida-py bacannot2json [ --input --output --print ] + +Options: + -h --help Show this screen. + -i --input= Path to bacannot results folder. + -o --output= JSON summary output file [Default: bacannot_summary.json]. + -p --print Also print resolved JSON to stdout. +""" + +################################## +### Loading Necessary Packages ### +################################## +from docopt import docopt +import pandas as pd +import json +import simplejson +import os +import yaml +from pathlib import Path +from .utils import find_files +from .general_stats_function import * +from .plasmid_function import * +from .virulence_function import * +from .resistance_function import * +from .mges_function import * + +############################## +### fix keys in dictionary ### +############################## +def convert_dictkey(d): + ###change all keys in a dict d + return { str(k): convert_dictvalue(v) for k,v in d.items() } + +def convert_dictvalue(v): + ###if v is a dict do convert_dictkey() for v, else raise v + if isinstance(v, dict): + return convert_dictkey(v) + else: + return v + +def stringify_keys(d): + """Convert a dict's keys to strings if they are not.""" + return convert_dictkey(d) + +############################################### +### based on annotations figure sample name ### +############################################### +def get_samples(filepaths): + samples = [] + for annotation in filepaths: + if not 'jbrowse' in str(annotation): + sample = annotation.split('/')[-2] + samples.append(sample) + return samples + +############################# +### initialize dictionary ### +############################# +def dict_init(indir): + # main dictionary for json final output + bacannot_summary = {} + + # detect available sample annotations + available_annotations = [ str(x) for x in find_files(start_dir=indir, pattern='annotation') ] + + # detect available samples + available_samples = [ str(x) for x in get_samples(available_annotations) ] + + # initiate first dictionary level + for sample in available_samples: + bacannot_summary[sample] = {} + bacannot_summary[sample]['sample'] = f"{sample}" + bacannot_summary[sample]['results_dir'] = f"{indir}/{sample}" + + return bacannot_summary + +####################################### +### Def main bacannot2json function ### +####################################### +def bacannot2json(indir, outfile, check): + + # initialize + bacannot_dir = os.path.abspath( indir ) + bacannot_summary = dict_init( bacannot_dir ) + + # check general annotation stats + general_stats( bacannot_summary ) + + # check virulence annotation stats + virulence_stats( bacannot_summary ) + + # check MGEs annotation stats + mges_stats( bacannot_summary ) + + # check plasmids annotation stats + plasmids_stats( bacannot_summary ) + + # check resistance annotation stats + resistance_stats( bacannot_summary ) + + # save results + final_results = simplejson.dumps( + stringify_keys( bacannot_summary ), + sort_keys=True, + indent=4, + ignore_nan=True + ) + with open(outfile, 'w') as file: + file.write(final_results) + + # keep checking + if check: + print( final_results ) + + # bye bye + print(f"==> Output generated and saved at:\n\t{outfile}") \ No newline at end of file diff --git a/falmeida_py/blasts.py b/falmeida_py/blasts.py new file mode 100644 index 0000000..9fe9de4 --- /dev/null +++ b/falmeida_py/blasts.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python3 +# coding: utf-8 + +## Def help message +usage_blasts = """ +A simple script to automatize the execution and filtering of blast alignments. + +--- +Copyright (C) 2020 Felipe Marques de Almeida (almeidafmarques@gmail.com) +License: Public Domain +Usage: + falmeida-py blasts + falmeida-py blasts -h|--help + falmeida-py blasts -v|--version + falmeida-py blasts ( --query --subject ) [ --task --minid --mincov --culling_limit --out --threads --2way ] + +Options: + -h --help Show this screen. + -v --version Show version information + --task= Select which task to run (blastn, blastp, tblastn or blastx) [Default: blastn]. + --2way Sets the pipeline to filter alignments by coverage in a 2way manner. + Which means an alignment must cover at least n from the query and + subject lengths. Otherwise it just needs to cover n from query seq. + This method is good when comparing query genes to subject genes. + --query= Query fasta file. + --subject= Subject fasta file. + --minid= Min. Identity percentage for gene annotation [Default: 80] + --mincov= Min. Covereage for gene annotation [Default: 80] + --culling_limit= Blast culling_limit for best hit only [Default: 1] + --out= File for saving blast outputs [Default: out.blast] + --threads= Number of threads to be used [Default: 1] +""" + +################################## +### Loading Necessary Packages ### +################################## +import pandas as pd +import os + +###################### +### BLAST FUNCTION ### +###################### +def blast(task, query, subject, culling, minid, mincov, out, threads, twoway): + + # Outfmt + outfmt="6 qseqid qstart qend qlen sseqid sstart send slen evalue length pident gaps gapopen bitscore" + + # format header + os.system(f"echo \"qseqid\tqstart\tqend\tqlen\tsseqid\tsstart\tsend\tslen\tevalue\tlength\tpident\tgaps\tgapopen\tbitscore\" > {out}") + + # format db + if task == "blastn" or task == "tblastn": + db_type = "nucl" + elif task == "blastx" or task == "blastp": + db_type = "prot" + os.system(f"makeblastdb -in {subject} -out ./FA-PY-SUBJECT-DB -dbtype {db_type} 1> /dev/null") + + # run blast + if twoway: + os.system(f"{task} -query {query} -db ./FA-PY-SUBJECT-DB -outfmt \"{outfmt}\" -num_threads {threads} -culling_limit {culling} | \ + awk -v minid={minid} -v mincov={mincov} '{{ if ($11 >= minid && (($10 - $12) / $8 * 100) >= mincov && (($10 - $12) / $4 * 100) >= mincov) {{print $0}} }}' >> {out} ") + else: + os.system(f"{task} -query {query} -db ./FA-PY-SUBJECT-DB -outfmt \"{outfmt}\" -num_threads {threads} -culling_limit {culling} | \ + awk -v minid={minid} -v mincov={mincov} '{{ if ($11 >= minid && (($10 - $12) / $4 * 100) >= mincov) {{print $0}} }}' >> {out} ") + + # clear work dir + os.system("rm -rf ./FA-PY-SUBJECT-DB*") + +######################## +### Summary function ### +######################## +def summary(output): + + # Outfmt + columns="QUERY\tQUERY_START\tQUERY_END\tQUERY_STRAND\t%QUERY_COV\tSUBJECT\tSUBJECT_START\tSUBJECT_END\tSUBJECT_STRAND\t%SUBJECT_COV\t%IDENTITY\tGAPS" + + # Summary + blast = pd.read_csv(output, sep="\t") + print(columns) + for index, line in blast.iterrows(): + # Query strand + if (line['qstart'] > line['qend']): + strand="-" + else: + strand="+" + # Subject strand + if (line['sstart'] > line['send']): + sstrand='-' + else: + sstrand='+' + # Parse headers + subject=line["sseqid"] + # Query coverage + qcov=round((100 * (line["length"] - line["gaps"]) / line["qlen"]), 2) + # Subject coverage + scov=round((100 * (line["length"] - line["gaps"]) / line["slen"]), 2) + # Identity + id=round(line["pident"], 2) + # Gaps + gaps=str(line["gapopen"]) + "/" + str(line["gaps"]) + + # Print + print(line["qseqid"], line["qstart"], line["qend"], strand, qcov, + subject, line["sstart"], line["send"], sstrand, scov, id, gaps, sep = "\t") diff --git a/falmeida_py/gbk2fasta.py b/falmeida_py/gbk2fasta.py new file mode 100644 index 0000000..a061da2 --- /dev/null +++ b/falmeida_py/gbk2fasta.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 +# coding: utf-8 + +## Def help message +usage_gbk2fasta = """ +A simple script to automatize the execution and filtering of blast alignments. + +--- +Copyright (C) 2021 Felipe Marques de Almeida (almeidafmarques@gmail.com) +License: Public Domain +Usage: + falmeida-py gbk2fasta + falmeida-py gbk2fasta -h|--help + falmeida-py gbk2fasta -v|--version + falmeida-py gbk2fasta ( --gbk ) [ --out --fofn --type ] + +Options: + -h --help Show this screen. + -v --version Show version information + -g --gbk Genbank file to be converted to fasta + -o --out Output fasta file [Default: stdout] + -f --fofn File with the list of genes to be extracted. One gene per line, using the 'locus_tag' field. + -t --type Type of sequence to output genes: nucl or prot [Default: prot] +""" + +################################## +### Loading Necessary Packages ### +################################## +from Bio import SeqIO + +######################################### +### load list of genes as python list ### +######################################### +def load_gene_list(genes_list): + """ + Loads a list of genes from a file. + """ + genes = [] + with open(genes_list, 'r') as f: + for line in f: + genes.append(line.strip()) + return genes + +################################################# +### prints a fasta from gbk biopython feature ### +################################################# +def print_fasta(feature, genes_list): + if genes_list == None: + print(f">{feature.qualifiers['locus_tag'][0]} {feature.qualifiers['product'][0]}") + print(feature.qualifiers['translation'][0]) + else: + if feature.qualifiers['locus_tag'][0] in genes_list: + print(f">{feature.qualifiers['locus_tag'][0]} {feature.qualifiers['product'][0]}") + print(feature.qualifiers['translation'][0]) + +########################################### +### loads and converts genbank to fasta ### +########################################### +def convertgbk(genbank, genes_list): + """ + Loads a genbank file and converts it to fasta. + """ + if genes_list: + genes = load_gene_list(genes_list) + else: + genes = None + for record in SeqIO.parse(genbank, "genbank"): + for feature in record.features: + if feature.type == "CDS": + print_fasta(feature, genes) \ No newline at end of file diff --git a/falmeida_py/general_stats_function.py b/falmeida_py/general_stats_function.py new file mode 100644 index 0000000..80e2590 --- /dev/null +++ b/falmeida_py/general_stats_function.py @@ -0,0 +1,48 @@ +################################## +### Loading Necessary Packages ### +################################## +import pandas as pd +import yaml +from pathlib import Path + +################################ +### check general annotation ### +################################ +def general_stats(bacannot_summary): + + # iterate over available samples + for sample in bacannot_summary: + + # load dir of samples' results + results_dir = bacannot_summary[sample]['results_dir'] + + # load annotation stats + general_results = yaml.safe_load( + Path(f"{results_dir}/annotation/{sample}.txt").read_text() + ) + + # load MLST + mlst_results = pd.read_csv( + f"{results_dir}/MLST/{sample}_mlst_analysis.txt", + sep='\t', header=None + ) + + # load refseq_masher + refseq_masher_results = pd.read_csv( + f"{results_dir}/refseq_masher/refseq_masher_results.txt", + sep='\t' + ) + refseq_masher_results.sort_values(by='distance', ascending=True, inplace=True) + + # save annotation stats + bacannot_summary[sample]['general_annotation'] = {} + bacannot_summary[sample]['general_annotation']['mlst'] = str(mlst_results[2].item()).replace('-', 'null') + bacannot_summary[sample]['general_annotation']['cds'] = general_results.get('CDS', 0) + bacannot_summary[sample]['general_annotation']['rrna'] = general_results.get('rRNA', 0) + bacannot_summary[sample]['general_annotation']['trna'] = general_results.get('tRNA', 0) + bacannot_summary[sample]['general_annotation']['tmrna'] = general_results.get('tmRNA', 0) + + bacannot_summary[sample]['general_annotation']['closest_reference'] = {} + bacannot_summary[sample]['general_annotation']['closest_reference']['strain'] = refseq_masher_results.head(1)['top_taxonomy_name'].item() + bacannot_summary[sample]['general_annotation']['closest_reference']['distance'] = refseq_masher_results.head(1)['distance'].item() + bacannot_summary[sample]['general_annotation']['closest_reference']['accession'] = refseq_masher_results.head(1)['assembly_accession'].item() \ No newline at end of file diff --git a/falmeida_py/mges_function.py b/falmeida_py/mges_function.py new file mode 100644 index 0000000..39014e0 --- /dev/null +++ b/falmeida_py/mges_function.py @@ -0,0 +1,169 @@ +################################## +### Loading Necessary Packages ### +################################## +import pandas as pd +import os +from .utils import load_and_subset_gff + +#################################### +### check MGEs annotations stats ### +#################################### +def mges_stats(bacannot_summary): + + # iterate over available samples + for sample in bacannot_summary: + + # load dir of samples' results + results_dir = bacannot_summary[sample]['results_dir'] + + # load gff_file + gff_file = f"{results_dir}/gffs/{sample}.gff" + + # integron_finder + if os.path.exists(f"{results_dir}/integron_finder/{sample}_integrons.gff") and os.stat(f"{results_dir}/integron_finder/{sample}_integrons.gff").st_size > 0: + + # init MGE annotation dictionary + if 'MGE' not in bacannot_summary[sample]: + bacannot_summary[sample]['MGE'] = {} + + # init integron_finder annotation dictionary + bacannot_summary[sample]['MGE']['integron_finder'] = {} + + # load integron_finder results + results = pd.read_csv( + f"{results_dir}/integron_finder/{sample}_integrons.gff", + sep='\t', + header=None, + names=[ + 'chr', 'source', 'type', 'start', 'end', 'score', 'strand', 'frame', 'atts' + ] + ) + + # number of integron_finder annotations + total_number = len(results.index) + bacannot_summary[sample]['MGE']['integron_finder']['total'] = total_number + + # per integron info + bacannot_summary[sample]['MGE']['integron_finder'] = {} + if int(results.shape[0]) > 0: + for seq in [ str(x) for x in results['chr'].unique() ]: + + bacannot_summary[sample]['MGE']['integron_finder'][seq] = {} + for index, row in results[results['chr'] == seq].reset_index().iterrows(): + id = row['atts'].split(';')[0].split('=')[-1] + bacannot_summary[sample]['MGE']['integron_finder'][seq][id] = {} + bacannot_summary[sample]['MGE']['integron_finder'][seq][id]['id'] = id + bacannot_summary[sample]['MGE']['integron_finder'][seq][id]['contig'] = row['chr'] + bacannot_summary[sample]['MGE']['integron_finder'][seq][id]['start'] = row['start'] + bacannot_summary[sample]['MGE']['integron_finder'][seq][id]['end'] = row['end'] + bacannot_summary[sample]['MGE']['integron_finder'][seq][id]['type'] = row['atts'].split(';')[1].split('=')[-1] + bacannot_summary[sample]['MGE']['integron_finder'][seq][id]['source'] = row['source'] + bacannot_summary[sample]['MGE']['integron_finder'][seq][id]['product'] = row['type'] + + # ICEberg database + ice_db_blastp = f"{results_dir}/ICEs/{sample}_iceberg_blastp_onGenes.summary.txt" + if os.path.exists(ice_db_blastp) and os.stat(ice_db_blastp).st_size > 0: + + # init MGE annotation dictionary + if 'MGE' not in bacannot_summary[sample]: + bacannot_summary[sample]['MGE'] = {} + + # init iceberg annotation dictionary + if 'ICE' not in bacannot_summary[sample]['MGE']: + bacannot_summary[sample]['MGE']['ICEberg'] = {} + + # init iceberg blastp annotation dictionary + bacannot_summary[sample]['MGE']['ICEberg']['blastp'] = {} + + # load integron_finder results + results = pd.read_csv( + ice_db_blastp, + sep='\t' + ) + + # load gff + gff = load_and_subset_gff(gff_file, 'source', 'ICEberg') + + # number of integron_finder annotations + total_number = len(results.index) + bacannot_summary[sample]['MGE']['ICEberg']['blastp']['total'] = total_number + + # per gene info + if int(results.shape[0]) > 0: + for seq in [ str(x) for x in results['SEQUENCE'].unique() ]: + + # details missing in output but available in gff + gff_row = gff[gff['attributes'].str.contains(seq)] + contig = gff_row['seq'].item() + start = gff_row['start'].item() + end = gff_row['end'].item() + + bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq] = {} + for index, row in results[results['SEQUENCE'] == seq].reset_index().iterrows(): + bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq] = {} + bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq]['id'] = row['ICEBERG_ID'] + bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq]['contig'] = contig + bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq]['start'] = start + bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq]['end'] = end + bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq]['accession'] = row['ACCESSION'] + bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq]['product'] = row['PRODUCT'] + bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq]['description'] = row['DESCRIPTION'] + bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq]['blast_start'] = row['START'] + bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq]['blast_end'] = row['END'] + bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq]['blast_identity'] = row['%IDENTITY'] + bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq]['blast_coverage'] = row['%COVERAGE'] + bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq]['strand'] = row['STRAND'] + + # PHAST database + phast_db_blastp = f"{results_dir}/prophages/phast_db/{sample}_phast_blastp_onGenes.summary.txt" + if os.path.exists(phast_db_blastp) and os.stat(phast_db_blastp).st_size > 0: + + # init MGE annotation dictionary + if 'MGE' not in bacannot_summary[sample]: + bacannot_summary[sample]['MGE'] = {} + + # init phast annotation dictionary + if 'ICE' not in bacannot_summary[sample]['MGE']: + bacannot_summary[sample]['MGE']['PHAST'] = {} + + # init phast blastp annotation dictionary + bacannot_summary[sample]['MGE']['PHAST']['blastp'] = {} + + # load integron_finder results + results = pd.read_csv( + phast_db_blastp, + sep='\t' + ) + + # load gff + gff = load_and_subset_gff(gff_file, 'source', 'PHAST') + + # number of integron_finder annotations + total_number = len(results.index) + bacannot_summary[sample]['MGE']['PHAST']['blastp']['total'] = total_number + + # per gene info + if int(results.shape[0]) > 0: + for seq in [ str(x) for x in results['SEQUENCE'].unique() ]: + + # details missing in output but available in gff + gff_row = gff[gff['attributes'].str.contains(seq)] + contig = gff_row['seq'].item() + start = gff_row['start'].item() + end = gff_row['end'].item() + + bacannot_summary[sample]['MGE']['PHAST']['blastp'][seq] = {} + for index, row in results[results['SEQUENCE'] == seq].reset_index().iterrows(): + bacannot_summary[sample]['MGE']['PHAST']['blastp'][seq] = {} + bacannot_summary[sample]['MGE']['PHAST']['blastp'][seq]['id'] = row['PHAST_ID'] + bacannot_summary[sample]['MGE']['PHAST']['blastp'][seq]['contig'] = contig + bacannot_summary[sample]['MGE']['PHAST']['blastp'][seq]['start'] = start + bacannot_summary[sample]['MGE']['PHAST']['blastp'][seq]['end'] = end + bacannot_summary[sample]['MGE']['PHAST']['blastp'][seq]['accession'] = row['ACCESSION'] + bacannot_summary[sample]['MGE']['PHAST']['blastp'][seq]['gene'] = row['GENE'] + bacannot_summary[sample]['MGE']['PHAST']['blastp'][seq]['description'] = row['DESCRIPTION'] + bacannot_summary[sample]['MGE']['PHAST']['blastp'][seq]['blast_start'] = row['START'] + bacannot_summary[sample]['MGE']['PHAST']['blastp'][seq]['blast_end'] = row['END'] + bacannot_summary[sample]['MGE']['PHAST']['blastp'][seq]['blast_identity'] = row['%IDENTITY'] + bacannot_summary[sample]['MGE']['PHAST']['blastp'][seq]['blast_coverage'] = row['%COVERAGE'] + bacannot_summary[sample]['MGE']['PHAST']['blastp'][seq]['strand'] = row['STRAND'] \ No newline at end of file diff --git a/falmeida_py/mpgap2csv.py b/falmeida_py/mpgap2csv.py new file mode 100644 index 0000000..9a6e873 --- /dev/null +++ b/falmeida_py/mpgap2csv.py @@ -0,0 +1,119 @@ +#!/usr/bin/env python3 +# coding: utf-8 + +######################## +### Def help message ### +######################## +usage_mpgap2csv = """ +A simple to generate metadata .csv for quickly producing tables for papers. Uses statistics calculated with quast and busco, condensed in multiqc file, generated with fmalmeida/MpGAP pipeline. + +--- +Copyright (C) 2022 Felipe Marques de Almeida (almeidafmarques@gmail.com) +License: Public Domain + +Usage: + falmeida-py mpgap2csv [ -h|--help ] [ --input --output ] + +Options: + -h --help Show this screen. + --input= Path to MpGAP outdir. + --output= File to save results (CSV). [Default: MpGAP_summary.csv] +""" + +################################## +### Loading Necessary Packages ### +################################## +from docopt import docopt +import pandas as pd +import os +from pathlib import Path +import json +from pprint import pprint + +################################### +### Defifining useful functions ### +################################### +def find_multiqc_file(dir): + matches = [] + for path in Path(dir).rglob('multiqc_data.json'): + matches.append(os.path.abspath(path.resolve())) + return matches + +def split_and_retrieve_desired_values(item): + values=str(item).split('/') + sample=values[-5] + method=values[-4] + del values[-5:] + outdir='/'.join(values) + + return sample, method, outdir + +def parse_json(data, assembler, field, item): + + return data['report_saved_raw_data'][field][assembler][item] + +def get_sample_and_assembly_info(files, df, quast_cols, busco_cols, base_columns): + + for idx, item in enumerate(files): + + sample, method, outdir = split_and_retrieve_desired_values(item) + + with open(item) as json_file: + data = json.load(json_file) + assemblies = list(data['report_general_stats_data'][0].keys()) + + # quast desires + quast_selection = [] + for index, val in enumerate(assemblies): + quast_selection.append( val ) + for selection in quast_cols: + quast_selection.append( parse_json( + data, val, 'multiqc_quast', selection + ) ) + quast_selection = [quast_selection[n:n+10] for n in range(0, len(quast_selection), 10)] + + # busco desires + busco_selection = [] + for index, val in enumerate(assemblies): + busco_selection.append( val ) + for selection in busco_cols: + busco_selection.append( parse_json( + data, val, 'multiqc_busco', selection + ) ) + busco_selection = [busco_selection[n:n+7] for n in range(0, len(busco_selection), 7)] + + for index, val in enumerate(assemblies): + final = [sample, method, outdir, item, val] + quast_selection[index][1:] + busco_selection[index][1:] + df.loc[len(df)] = final + # print(final) + + + +#################### +## Defining main ### +#################### +def mpgap2csv(indir, output): + + base_columns = [ "sample", "method", "outdir", "multiqc_file", "software" ] + desired_quast_columns = [ + "# contigs", "N50", "Total length", + "# total reads", "Properly paired (%)", "Avg. coverage depth", + "# predicted rRNA genes", "Complete BUSCO (%)", "Partial BUSCO (%)" + ] + desired_busco_columns = [ + "complete_single_copy", "complete_duplicated", "fragmented", + "missing", "total", "lineage_dataset" + ] + multiqc_files_df = pd.DataFrame(columns = list(base_columns + desired_quast_columns + desired_busco_columns)) + + get_sample_and_assembly_info( + find_multiqc_file(indir), + multiqc_files_df, + desired_quast_columns, + desired_busco_columns, + base_columns + ) + + # save file + multiqc_files_df.to_csv(output, index=False) + print(f"=> Saved results in:\n\t{output}") diff --git a/falmeida_py/plasmid_function.py b/falmeida_py/plasmid_function.py new file mode 100644 index 0000000..789aa9a --- /dev/null +++ b/falmeida_py/plasmid_function.py @@ -0,0 +1,133 @@ +################################## +### Loading Necessary Packages ### +################################## +import pandas as pd +import os + +######################################## +### check plasmids annotations stats ### +######################################## +def plasmids_stats(bacannot_summary): + + # iterate over available samples + for sample in bacannot_summary: + + # load dir of samples' results + results_dir = bacannot_summary[sample]['results_dir'] + + # init plasmids annotation dictionary + bacannot_summary[sample]['plasmid'] = {} + + # platon + if os.path.exists(f"{results_dir}/plasmids/platon/{sample}.tsv") and os.stat(f"{results_dir}/plasmids/platon/{sample}.tsv").st_size > 0: + + # init platon annotation dictionary + bacannot_summary[sample]['plasmid']['platon'] = {} + + # load platon results + results = pd.read_csv( + f"{results_dir}/plasmids/platon/{sample}.tsv", + sep='\t' + ) + + # number of plasmid annotations + total_number = len(results.index) + bacannot_summary[sample]['plasmid']['platon']['total'] = total_number + + # per plasmid info + if int(results.shape[0]) > 0: + for seq in [x for x in results['ID'].unique()]: + bacannot_summary[sample]['plasmid']['platon'][seq] = {} + bacannot_summary[sample]['plasmid']['platon'][seq]['Length'] = results.loc[results['ID'] == seq, 'Length'].item() + bacannot_summary[sample]['plasmid']['platon'][seq]['ORFs'] = results.loc[results['ID'] == seq, '# ORFs'].item() + bacannot_summary[sample]['plasmid']['platon'][seq]['Circular'] = results.loc[results['ID'] == seq, 'Circular'].item() + bacannot_summary[sample]['plasmid']['platon'][seq]['AMRs'] = results.loc[results['ID'] == seq, '# AMRs'].item() + bacannot_summary[sample]['plasmid']['platon'][seq]['Replication'] = results.loc[results['ID'] == seq, '# Replication'].item() + bacannot_summary[sample]['plasmid']['platon'][seq]['Mobilization'] = results.loc[results['ID'] == seq, '# Mobilization'].item() + bacannot_summary[sample]['plasmid']['platon'][seq]['Conjugation'] = results.loc[results['ID'] == seq, '# Conjugation'].item() + + # plasmidfinder + if os.path.exists(f"{results_dir}/plasmids/plasmidfinder/results_tab.tsv") and os.stat(f"{results_dir}/plasmids/plasmidfinder/results_tab.tsv").st_size > 0: + + # init platon annotation dictionary + bacannot_summary[sample]['plasmid']['plasmidfinder'] = {} + + # load platon results + results = pd.read_csv( + f"{results_dir}/plasmids/plasmidfinder/results_tab.tsv", + sep='\t' + ) + + if not results.empty: + + # databases + print( results['Database'].unique() ) + bacannot_summary[sample]['plasmid']['plasmidfinder']['meta'] = {} + db_arr = results['Database'].unique() + bacannot_summary[sample]['plasmid']['plasmidfinder']['meta']['database'] = db_arr.tolist() if len(db_arr) > 1 else db_arr.item() + + # number of plasmid annotations + total_number = len(results['Contig'].unique()) + bacannot_summary[sample]['plasmid']['plasmidfinder']['total'] = total_number + + # plasmid annotations contigs + for seq in [ str(x) for x in results['Contig'].unique() ]: + bacannot_summary[sample]['plasmid']['plasmidfinder'][seq] = {} + bacannot_summary[sample]['plasmid']['plasmidfinder'][seq]['inc_types'] = {} + bacannot_summary[sample]['plasmid']['plasmidfinder'][seq]['identity'] = {} + bacannot_summary[sample]['plasmid']['plasmidfinder'][seq]['accession'] = {} + + for index, row in results.iterrows(): + contig = str(row['Contig']) + bacannot_summary[sample]['plasmid']['plasmidfinder'][contig]['inc_types'] = row['Plasmid'] + bacannot_summary[sample]['plasmid']['plasmidfinder'][contig]['identity'] = row['Identity'] + bacannot_summary[sample]['plasmid']['plasmidfinder'][contig]['accession'] = row['Accession number'] + + # mob suite + if os.path.exists(f"{results_dir}/plasmids/mob_suite/{sample}_mobtyper_results.txt") and os.stat(f"{results_dir}/plasmids/mob_suite/{sample}_mobtyper_results.txt").st_size > 0: + + # init integron_finder annotation dictionary + bacannot_summary[sample]['plasmid']['mob_suite'] = {} + + # load integron_finder results + results = pd.read_csv( + f"{results_dir}/plasmids/mob_suite/{sample}_mobtyper_results.txt", + sep='\t', + header='infer', + # sample_id num_contigs size gc md5 rep_type(s) rep_type_accession(s) relaxase_type(s) relaxase_type_accession(s) mpf_type mpf_type_accession(s) orit_type(s) orit_accession(s) predicted_mobility mash_nearest_neighbor mash_neighbor_distance mash_neighbor_identification primary_cluster_id secondary_cluster_id predicted_host_range_overall_rank predicted_host_range_overall_name observed_host_range_ncbi_rank observed_host_range_ncbi_name reported_host_range_lit_rank reported_host_range_lit_name associated_pmid(s) + ) + + # number of plasmid types annotated annotations + # total_number = len(results.index) - 1 # always counts chromosome + # bacannot_summary[sample]['plasmid']['mob_suite']['total'] = total_number + + # per integron info + bacannot_summary[sample]['plasmid']['mob_suite'] = {} + if int(results.shape[0]) > 0: + for seq in [ str(x) for x in results['sample_id'].unique() ]: + + bacannot_summary[sample]['plasmid']['mob_suite'][seq] = {} + for index, row in results[results['sample_id'].astype(str) == seq].reset_index().iterrows(): + id = row['sample_id'] # they are the same for this result + bacannot_summary[sample]['plasmid']['mob_suite'][seq][id] = {} + bacannot_summary[sample]['plasmid']['mob_suite'][seq][id]['size'] = row['size'] + bacannot_summary[sample]['plasmid']['mob_suite'][seq][id]['rep_type'] = row['rep_type(s)'].replace(',', '; ') + bacannot_summary[sample]['plasmid']['mob_suite'][seq][id]['rep_type_accession'] = row['rep_type_accession(s)'].replace(',', '; ') + bacannot_summary[sample]['plasmid']['mob_suite'][seq][id]['relaxase_type'] = row['relaxase_type(s)'] + bacannot_summary[sample]['plasmid']['mob_suite'][seq][id]['relaxase_type_accession(s)'] = row['relaxase_type_accession(s)'] + bacannot_summary[sample]['plasmid']['mob_suite'][seq][id]['mpf_type'] = row['mpf_type'] + bacannot_summary[sample]['plasmid']['mob_suite'][seq][id]['mpf_type_accession'] = row['mpf_type_accession(s)'] + bacannot_summary[sample]['plasmid']['mob_suite'][seq][id]['orit_type'] = row['orit_type(s)'] + bacannot_summary[sample]['plasmid']['mob_suite'][seq][id]['orit_accession'] = row['orit_accession(s)'] + bacannot_summary[sample]['plasmid']['mob_suite'][seq][id]['mash_nearest_neighbor'] = row['mash_nearest_neighbor'] + bacannot_summary[sample]['plasmid']['mob_suite'][seq][id]['mash_neighbor_distance'] = row['mash_neighbor_distance'] + bacannot_summary[sample]['plasmid']['mob_suite'][seq][id]['mash_neighbor_identification'] = row['mash_neighbor_identification'] + bacannot_summary[sample]['plasmid']['mob_suite'][seq][id]['primary_cluster_id'] = row['primary_cluster_id'] + bacannot_summary[sample]['plasmid']['mob_suite'][seq][id]['secondary_cluster_id'] = row['secondary_cluster_id'] + bacannot_summary[sample]['plasmid']['mob_suite'][seq][id]['predicted_host_range_overall_rank'] = row['predicted_host_range_overall_rank'] + bacannot_summary[sample]['plasmid']['mob_suite'][seq][id]['predicted_host_range_overall_name'] = row['predicted_host_range_overall_name'] + bacannot_summary[sample]['plasmid']['mob_suite'][seq][id]['observed_host_range_ncbi_rank'] = row['observed_host_range_ncbi_rank'] + bacannot_summary[sample]['plasmid']['mob_suite'][seq][id]['observed_host_range_ncbi_name'] = row['observed_host_range_ncbi_name'] + bacannot_summary[sample]['plasmid']['mob_suite'][seq][id]['reported_host_range_lit_rank'] = row['reported_host_range_lit_rank'] + bacannot_summary[sample]['plasmid']['mob_suite'][seq][id]['reported_host_range_lit_name'] = row['reported_host_range_lit_name'] + bacannot_summary[sample]['plasmid']['mob_suite'][seq][id]['associated_pmid'] = row['associated_pmid(s)'] \ No newline at end of file diff --git a/falmeida_py/replace_fasta_seq.py b/falmeida_py/replace_fasta_seq.py new file mode 100644 index 0000000..98399e9 --- /dev/null +++ b/falmeida_py/replace_fasta_seq.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 +# coding: utf-8 + +######################## +### Def help message ### +######################## +usage_replace_fasta_seq = """ +A script meant to replace a string in a FASTA file using defitions in a BED file (tab-separated). + +--- +Copyright (C) 2021 Felipe Marques de Almeida (almeidafmarques@gmail.com) +License: Public Domain + +Usage: + falmeida-py replace_fasta_seq [ -h|--help ] + falmeida-py replace_fasta_seq [ --fasta --bed --out ] + +Options: + -h --help Show this screen. + -f --fasta= FASTA file to replace sequences in. + -o --out= Output FASTA file [Default: out.fasta]. + -b --bed= BED file with replacement definitions. + +Comments: + The BED (start 0-based) file MUST be a 4 column file with the following format: + contig start end sub_seq +""" + +################################## +### Loading Necessary Packages ### +################################## +from Bio import SeqIO + +########################## +### load fasta as dict ### +########################## +def fasta_as_dict(fasta): + fasta_dict = {} + for seq_record in SeqIO.parse(fasta, "fasta"): + fasta_dict[seq_record.id] = seq_record + return fasta_dict + +############################################# +### replace sequences using values in bed ### +############################################# +def replace_seq_in_dict(bed, dict): + with open(bed) as f: + for line in f: + contig, start, end, sub_seq = line.strip().split('\t') + dict[contig].seq = dict[contig].seq[:int(start)] + sub_seq + dict[contig].seq[int(end):] + +#################### +### output fasta ### +#################### +def output_fasta(dict, out): + with open(out, 'w') as f: + for record in dict.values(): + print(">" + record.id, file=f) + print(record.seq, file=f) + +##################### +### main function ### +##################### +def replace_fasta_seq(input, output, bed): + fasta_dict = fasta_as_dict(input) + replace_seq_in_dict(bed, fasta_dict) + output_fasta(fasta_dict, output) diff --git a/falmeida_py/resistance_function.py b/falmeida_py/resistance_function.py new file mode 100644 index 0000000..7f7ff01 --- /dev/null +++ b/falmeida_py/resistance_function.py @@ -0,0 +1,189 @@ +################################## +### Loading Necessary Packages ### +################################## +import pandas as pd +from .utils import find_files +import json +import os +import yaml +from pathlib import Path +from .utils import load_and_subset_gff +from pprint import pprint + +########################################## +### check resistance annotations stats ### +########################################## +def resistance_stats(bacannot_summary): + + # iterate over available samples + for sample in bacannot_summary: + + # load dir of samples' results + results_dir = bacannot_summary[sample]['results_dir'] + + # load gff_file + gff_file = f"{results_dir}/gffs/{sample}.gff" + + # load annotation stats + if os.path.exists(f"{results_dir}/resistance"): + + # init plasmids annotation dictionary + bacannot_summary[sample]['resistance'] = {} + + ########### + ### rgi ### + ########### + if os.path.exists(f"{results_dir}/resistance/RGI/RGI_{sample}.txt") and os.stat(f"{results_dir}/resistance/RGI/RGI_{sample}.txt").st_size > 0: + + # init rgi annotation dictionary + bacannot_summary[sample]['resistance']['rgi'] = {} + + # load rgi results + results = pd.read_csv( + f"{results_dir}/resistance/RGI/RGI_{sample}.txt", + sep='\t' + ) + results.drop_duplicates(inplace=True) + + # load gff + gff = load_and_subset_gff(gff_file, 'source', 'CARD') + + # number of annotations + total_number = len(results['ORF_ID'].unique()) + bacannot_summary[sample]['resistance']['rgi']['total'] = total_number + + # gene annotations + for gene in [ str(x) for x in results['ORF_ID'].unique() ]: + + # init values + row = results.loc[results['ORF_ID'] == gene] + name_split_list = gene.split(' ') + gene = name_split_list[0] + name = ' '.join(name_split_list[1:]) + bacannot_summary[sample]['resistance']['rgi'][gene] = {} + gff_row = gff[gff['attributes'].str.contains(f"ID={gene}")] + contig = gff_row['seq'].item() + start = gff_row['start'].item() + end = gff_row['end'].item() + + # add values to dict + bacannot_summary[sample]['resistance']['rgi'][gene]['name'] = name + bacannot_summary[sample]['resistance']['rgi'][gene]['gene'] = row['Best_Hit_ARO'].item() + bacannot_summary[sample]['resistance']['rgi'][gene]['card_aro'] = row['ARO'].item() + bacannot_summary[sample]['resistance']['rgi'][gene]['cut_off'] = row['Cut_Off'].item() + bacannot_summary[sample]['resistance']['rgi'][gene]['resistance_mechanism'] = row['Resistance Mechanism'].item() + bacannot_summary[sample]['resistance']['rgi'][gene]['gene_family'] = row['AMR Gene Family'].item() + bacannot_summary[sample]['resistance']['rgi'][gene]['subclass'] = row['Drug Class'].item() + bacannot_summary[sample]['resistance']['rgi'][gene]['identity'] = row['Best_Identities'].item() + bacannot_summary[sample]['resistance']['rgi'][gene]['accession'] = row['Model_ID'].item() + bacannot_summary[sample]['resistance']['rgi'][gene]['contig'] = contig + bacannot_summary[sample]['resistance']['rgi'][gene]['start'] = start + bacannot_summary[sample]['resistance']['rgi'][gene]['end'] = end + + + ##################### + ### amrfinderplus ### + ##################### + if os.path.exists(f"{results_dir}/resistance/AMRFinderPlus/AMRFinder_resistance-only.tsv") and os.stat(f"{results_dir}/resistance/AMRFinderPlus/AMRFinder_resistance-only.tsv").st_size > 0: + + # init amrfinderplus annotation dictionary + bacannot_summary[sample]['resistance']['amrfinderplus'] = {} + + # load amrfinderplus results + results = pd.read_csv( + f"{results_dir}/resistance/AMRFinderPlus/AMRFinder_resistance-only.tsv", + sep='\t' + ) + + # load gff + gff = load_and_subset_gff(gff_file, 'source', 'AMRFinderPlus') + + # number of annotations + total_number = len(results['Protein identifier'].unique()) + bacannot_summary[sample]['resistance']['amrfinderplus']['total'] = total_number + + # gene annotations + for gene in [ str(x) for x in results['Protein identifier'].unique() ]: + + # init values + row = results.loc[results['Protein identifier'] == gene] + bacannot_summary[sample]['resistance']['amrfinderplus'][gene] = {} + gene_name = row['Gene symbol'].item() + drug_class = row['Subclass'].item() + drug_type = row['Element type'].item() + identity = row['% Identity to reference sequence'].item() + + gff_row = gff[gff['attributes'].str.contains(f"ID={gene}")] + contig = gff_row['seq'].item() + start = gff_row['start'].item() + end = gff_row['end'].item() + + # add values to dict + bacannot_summary[sample]['resistance']['amrfinderplus'][gene]['gene'] = gene_name + bacannot_summary[sample]['resistance']['amrfinderplus'][gene]['subclass'] = drug_class + bacannot_summary[sample]['resistance']['amrfinderplus'][gene]['type'] = drug_type + bacannot_summary[sample]['resistance']['amrfinderplus'][gene]['identity'] = identity + bacannot_summary[sample]['resistance']['amrfinderplus'][gene]['contig'] = contig + bacannot_summary[sample]['resistance']['amrfinderplus'][gene]['start'] = start + bacannot_summary[sample]['resistance']['amrfinderplus'][gene]['end'] = end + + # + # check for rgi orthologies + # + try: + if gene in bacannot_summary[sample]['resistance']['rgi']: + bacannot_summary[sample]['resistance']['amrfinderplus'][gene]['card_aro'] = bacannot_summary[sample]['resistance']['rgi'][gene]['card_aro'] + else: + bacannot_summary[sample]['resistance']['amrfinderplus'][gene]['card_aro'] = None + except: + bacannot_summary[sample]['resistance']['amrfinderplus'][gene]['card_aro'] = None + + ################# + ### resfinder ### + ################# + # + # TODO: Include genomic coordinates info + # + if os.path.exists(f"{results_dir}/resistance/resfinder/ResFinder_results_tab.txt") and os.stat(f"{results_dir}/resistance/resfinder/ResFinder_results_tab.txt").st_size > 0: + + # init resfinder annotation dictionary + bacannot_summary[sample]['resistance']['resfinder'] = {} + + # load gff + gff = load_and_subset_gff(gff_file, 'source', 'Resfinder') + + # number of annotations + bacannot_summary[sample]['resistance']['resfinder']['total'] = len(gff.index) + + # since resfinder output does not has locus_tag information + # for this module, we will be using the resolved gff from bacannot + for index, row in gff.iterrows(): + + # init attributes as dict + attributes = dict() + for keyvaluepair in row['attributes'].split(';'): + items = keyvaluepair.split('=') + key = items[0] + value = '='.join(items[1:]) + attributes[key] = value + gene = attributes['ID'] + + # parse + bacannot_summary[sample]['resistance']['resfinder'][gene] = {} + bacannot_summary[sample]['resistance']['resfinder'][gene]['start'] = row['start'] + bacannot_summary[sample]['resistance']['resfinder'][gene]['end'] = row['end'] + # bacannot_summary[sample]['resistance']['resfinder'][gene]['Identity'] = row['Identity'] + bacannot_summary[sample]['resistance']['resfinder'][gene]['name'] = attributes['Resfinder_gene'] + bacannot_summary[sample]['resistance']['resfinder'][gene]['phenotype'] = attributes['Resfinder_phenotype'] + bacannot_summary[sample]['resistance']['resfinder'][gene]['accession'] = attributes['Resfinder_reference'] + + # + # check for rgi orthologies + # + try: + if gene in bacannot_summary[sample]['resistance']['rgi']: + bacannot_summary[sample]['resistance']['resfinder'][gene]['card_aro'] = bacannot_summary[sample]['resistance']['rgi'][gene]['card_aro'] + else: + bacannot_summary[sample]['resistance']['resfinder'][gene]['card_aro'] = None + except: + bacannot_summary[sample]['resistance']['resfinder'][gene]['card_aro'] = None diff --git a/falmeida_py/splitgbk.py b/falmeida_py/splitgbk.py new file mode 100644 index 0000000..71abdee --- /dev/null +++ b/falmeida_py/splitgbk.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python +# coding: utf-8 + +## Def help message +usage_splitgbk = """ +A very simple script to split multisequence genbank files into separate files +Copyright (C) 2020 Felipe Marques de Almeida (almeidafmarques@gmail.com) +License: Public Domain + +usage: + falmeida-py splitgbk + falmeida-py splitgbk [ -h|--help ] + falmeida-py splitgbk [ --gbk ] [ --outdir ] + +options: + -h --help Show this screen. + -g --gbk= Input genbank file to split into multiple individual files. + -o --outdir= Directory (must already exist) in which to write the splitted files [Default: ./]. +""" + +################################## +### Loading Necessary Packages ### +################################## +from Bio import SeqIO +import os + +#################### +### GBK splitter ### +#################### +def splitgbk(gbk, outdir): + # just parse the dir + if type(outdir) == list: + outdir = outdir[0] + # exec biopython + for rec in SeqIO.parse(gbk, "genbank"): + SeqIO.write([rec], open(os.path.basename(os.path.normpath(outdir)) + "/" + rec.id + ".gbk", "w"), "genbank") + # finish + print(f"Done!\nIndividual files have been written at: {outdir}") diff --git a/falmeida_py/tsv2markdown.py b/falmeida_py/tsv2markdown.py new file mode 100644 index 0000000..5b49933 --- /dev/null +++ b/falmeida_py/tsv2markdown.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python +# coding: utf-8 + +## Def help message +usage_tsv2markdown = """ +A simple script to convert tsv (or csv) files to markdown tables using tabulate! +Copyright (C) 2020 Felipe Marques de Almeida (almeidafmarques@gmail.com) +License: Public Domain + +usage: + falmeida-py tsv2markdown + falmeida-py tsv2markdown [ -h|--help ] + falmeida-py tsv2markdown [ --tsv --csv --header ] + +options: + -h --help Show this screen. + --tsv= Input tsv file to print as markdown table + --csv= Input csv file to print as markdown table + --header= If file does not have a header, set a + custom header. E.g. --header "Planet,R (km),mass (x 10^29 kg)". +""" + +################################## +### Loading Necessary Packages ### +################################## +from docopt import docopt +from tabulate import tabulate + +########################### +### Read header as list ### +########################### +def header2list(header): + return header.split(',') + +################################### +### Convert with header in file ### +################################### +def file2mw(filep, fsep, header): + + # read data.frame + with open(filep) as file_in: + lines = [] + for line in file_in: + words = line.split(fsep) + words_striped = [word.strip() for word in words] + lines.append(words_striped) + + # generate tabulate + if header: + print(tabulate(lines, headers=header2list(header), tablefmt="github")) + else: + print(tabulate(lines, headers="firstrow", tablefmt="github")) diff --git a/falmeida_py/utils.py b/falmeida_py/utils.py new file mode 100644 index 0000000..99e7111 --- /dev/null +++ b/falmeida_py/utils.py @@ -0,0 +1,24 @@ +from pathlib import Path +import pandas as pd + +def find_files(start_dir, pattern): + matches = [] + for path in Path(start_dir).rglob(pattern): + matches.append(path.resolve()) + return matches + +def load_and_subset_gff(file, col, pattern): + df = pd.read_csv( + file, sep='\t', + names=[ + "seq", "source", "type", "start", "end", + "score", "strand", "frame", "attributes" + ] + ) + + filter = df[col].str.contains(pattern) + + df = df[filter] + df.drop_duplicates(inplace=True) + + return df \ No newline at end of file diff --git a/falmeida_py/version.py b/falmeida_py/version.py new file mode 100644 index 0000000..e524e2d --- /dev/null +++ b/falmeida_py/version.py @@ -0,0 +1,20 @@ +""" +The version is stored here in a separate file so it can exist in only one place. +https://stackoverflow.com/a/7071358/2438989 + +Copyright 2022 Felipe Almeida (almeidafmarques@gmail.com) +https://github.com/fmalmeida/pythonScripts + +This file is part of my custom python scripts (falmeida-py) package, which is free: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by the Free Software Foundation, +either version 3 of the License, or (at your option) any later version. This package is distributed +in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +details. You should have received a copy of the GNU General Public License along with falmeida-py package. +If not, see . +""" + +__version__ = '1.2.4' + +def get_version(): + return __version__ diff --git a/falmeida_py/virulence_function.py b/falmeida_py/virulence_function.py new file mode 100644 index 0000000..91e912d --- /dev/null +++ b/falmeida_py/virulence_function.py @@ -0,0 +1,114 @@ +################################## +### Loading Necessary Packages ### +################################## +import pandas as pd +from .utils import find_files +import json +import os +import yaml +from pathlib import Path +from .utils import load_and_subset_gff + +################################## +### check virulence annotation ### +################################## +def virulence_stats(bacannot_summary): + + # iterate over available samples + for sample in bacannot_summary: + + # load dir of samples' results + results_dir = bacannot_summary[sample]['results_dir'] + + # load gff_file + gff_file = f"{results_dir}/gffs/{sample}.gff" + + # load annotation stats + if os.path.exists(f"{results_dir}/virulence"): + + # init virulence annotation dictionary + bacannot_summary[sample]['virulence'] = {} + + # vfdb + if os.path.exists(f"{results_dir}/virulence/vfdb/{sample}_vfdb_blastn_onGenes.summary.txt") and os.stat(f"{results_dir}/virulence/vfdb/{sample}_vfdb_blastn_onGenes.summary.txt").st_size > 0: + + # init VFDB annotation dictionary + bacannot_summary[sample]['virulence']['VFDB'] = {} + + # load gff + gff = load_and_subset_gff(gff_file, 'source', 'VFDB') + + # load vfdb results + results = pd.read_csv( + f"{results_dir}/virulence/vfdb/{sample}_vfdb_blastn_onGenes.summary.txt", + sep='\t' + ) + + # number of virulence genes + total_number_of_genes = len( results['SEQUENCE'].unique() ) + bacannot_summary[sample]['virulence']['VFDB']['total'] = total_number_of_genes + + # gene annotations + for gene in [ str(x) for x in results['SEQUENCE'].unique() ]: + + # init values + bacannot_summary[sample]['virulence']['VFDB'][gene] = {} + row = results.loc[results['SEQUENCE'] == gene] + vf_name = row['PRODUCT'].item().split('_(')[0].replace("[", "") + vf_id = row['PRODUCT'].item().split('_(')[1].split(')')[0].replace(")", "") + vf_fullname = row['PRODUCT'].item().replace("[", "").replace("]", "") + gene_name = row['GENE'].item().replace(")", "").replace("(", "") + gff_row = gff[gff['attributes'].str.contains(gene)] + contig = gff_row['seq'].item() + start = gff_row['start'].item() + end = gff_row['end'].item() + + # add to dict + bacannot_summary[sample]['virulence']['VFDB'][gene]['virulence_factor'] = vf_name + bacannot_summary[sample]['virulence']['VFDB'][gene]['product'] = vf_fullname + bacannot_summary[sample]['virulence']['VFDB'][gene]['id'] = vf_id + bacannot_summary[sample]['virulence']['VFDB'][gene]['gene'] = gene_name + bacannot_summary[sample]['virulence']['VFDB'][gene]['chr'] = contig + bacannot_summary[sample]['virulence']['VFDB'][gene]['start'] = start + bacannot_summary[sample]['virulence']['VFDB'][gene]['end'] = end + + # victors + if os.path.exists(f"{results_dir}/virulence/victors/{sample}_victors_blastp_onGenes.summary.txt") and os.stat(f"{results_dir}/virulence/victors/{sample}_victors_blastp_onGenes.summary.txt").st_size > 0: + + # init victors annotation dictionary + gff = bacannot_summary[sample]['virulence']['Victors'] = {} + + # load victors results + results = pd.read_csv( + f"{results_dir}/virulence/victors/{sample}_victors_blastp_onGenes.summary.txt", + sep='\t' + ) + + # load gff + gff = load_and_subset_gff(gff_file, 'source', 'Victors') + + # number of virulence genes + total_number_of_genes = len( results['SEQUENCE'].unique() ) + bacannot_summary[sample]['virulence']['Victors']['total'] = total_number_of_genes + + # gene annotations + for gene in [ str(x) for x in results['SEQUENCE'].unique() ]: + + # init values + row = results.loc[results['SEQUENCE'] == gene] + bacannot_summary[sample]['virulence']['Victors'][gene] = {} + vf_id = row['VICTORS_ID'].item().replace("Victors_", "") + gene_name = row['GENE'].item() + product = row['DESCRIPTION'].item() + gff_row = gff[gff['attributes'].str.contains(gene)] + contig = gff_row['seq'].item() + start = gff_row['start'].item() + end = gff_row['end'].item() + + # add values to dict + bacannot_summary[sample]['virulence']['Victors'][gene]['id'] = vf_id + bacannot_summary[sample]['virulence']['Victors'][gene]['name'] = gene_name + bacannot_summary[sample]['virulence']['Victors'][gene]['product'] = product + bacannot_summary[sample]['virulence']['Victors'][gene]['contig'] = contig + bacannot_summary[sample]['virulence']['Victors'][gene]['start'] = start + bacannot_summary[sample]['virulence']['Victors'][gene]['end'] = end \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..eff09bb --- /dev/null +++ b/requirements.txt @@ -0,0 +1,7 @@ +docopt +pandas +tabulate +biopython +simplejson +importlib_metadata +pyyaml diff --git a/rgitool_test/rgi2gff.py b/rgitool_test/rgi2gff.py deleted file mode 100644 index 07c245b..0000000 --- a/rgitool_test/rgi2gff.py +++ /dev/null @@ -1,91 +0,0 @@ -#!/usr/bin/env python3 - -'''rgi2gff.py last modified 2018-07-09 - -rgi2gff.py -b RGI_out.txt > output.gff3 - -RGI tabular output must be first parsed with: - -sed -i 's/ # /#/g' and sed -i 's/ /_/g' -''' - -# -import sys -import argparse -import time -from collections import defaultdict -# - -# Returns s truncated at the n'th (3rd by default) occurrence of the delimiter, d. -def trunc_at(s, d, n=3): - return d.join(s.split(d, n)[:n]) - - -def write_line(outlist, wayout): - outline = "\t".join(outlist) - print(outline, file=wayout) - - -def main(argv, wayout): - if not len(argv): - argv.append("-h") - - parser = argparse.ArgumentParser( - formatter_class=argparse.RawDescriptionHelpFormatter, description=__doc__) - parser.add_argument('-f', '--file', help="RGI txt output file") - args = parser.parse_args(argv) - - # counter for number of lines, and strand flips - linecounter, writecounter = 0, 0 - plusstrand, minusstrand = 0, 0 - - hitDictCounter = defaultdict(int) - print("Starting RGI parsing on %s" % (args.file), time.asctime(), file=sys.stderr) - for line in open(args.file, 'r'): - linecounter += 1 - ORF_ID, Contig, Start, Stop, Orientation, Cut_Off, Pass_Bitscore, Best_Hit_Bitscore, \ - Best_Hit_ARO, Best_Identities, ARO, Model_type, SNPs_in_Best_Hit_ARO, Other_SNPs, \ - Drug_Class, Resistance_Mechanism, AMR_Gene_Family, Predicted_DNA, Predicted_Protein, \ - CARD_Protein_Sequence, Percentage_Length_Reference_Sequence, ID, Model_id= line.rstrip().split("\t") - - # Reformat Contig ID - Contig = trunc_at(Contig, "_", n=2) - # Reformat Drug_Classes - Drug_Class = Drug_Class.replace(";_", "&") - # Reformat Resistance_Mechanism - Resistance_Mechanism = Resistance_Mechanism.replace(";_", "&") - # Define attributes - attributes = "Additional_database=CARD_RGI;CARD_gene_name={0};CARD_gene_family={1};CARD_ARO={2};CARD_target_drugs={3};CARD_resistance_mechanism={4}".format( - Best_Hit_ARO, AMR_Gene_Family, ARO, Drug_Class, Resistance_Mechanism) - - #print("Starting base on {0}".format(Start), file=sys.stderr) - - # convert strings of start and end to integers for calculations - iqend = int(Start) - iqstart = int(Stop) - # as start must always be less or equal to end, reverse them for opposite strand hits - if iqstart <= iqend: - strand = "+" - outlist = [Contig, "CARD_RGI", "resistance", Start, - Stop, Best_Hit_Bitscore , strand, ".", attributes] - plusstrand += 1 - else: - strand = "-" - outlist = [Contig, "CARD_RGI", "resistance", Start, - Stop, Best_Hit_Bitscore, strand, ".", attributes] - minusstrand += 1 - - writecounter += 1 - - # Print results - write_line(outlist, wayout) - - # Logs - print("Parsed %d lines" % (linecounter), time.asctime(), file=sys.stderr) - print("Found %d forward and %d reverse hits" % ( - plusstrand, minusstrand), time.asctime(), file=sys.stderr) - print("Wrote %d matches" % (writecounter), time.asctime(), file=sys.stderr) - - -if __name__ == "__main__": - main(sys.argv[1:], sys.stdout) diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..7b2d7ed --- /dev/null +++ b/setup.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python3 +""" +This is the falmeida-py installation script. Assuming you're in the same directory, it can be run +like this: `python3 setup.py install`, or (probably better) like this: `pip3 install .` + +Copyright 2020 Felipe Almeida (almeidafmarques@gmail.com) +https://github.com/fmalmeida/pythonScripts + +This file is part of my custom python scripts (falmeida-py) package, which is free: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by the Free Software Foundation, +either version 3 of the License, or (at your option) any later version. This package is distributed +in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +details. You should have received a copy of the GNU General Public License along with falmeida-py package. +If not, see . +""" + +from setuptools import setup + + +def readme(): + with open('README.md') as f: + return f.read() + + +# Get the program version from another file. +__version__ = '0.0.0' +exec(open('falmeida_py/version.py').read()) + +with open('requirements.txt') as f: + required = f.read().splitlines() + +setup( + name='falmeida-py', + version=__version__, + description='falmeida-py: a package to the simple distribution of my custom scripts.', + long_description=readme(), + long_description_content_type='text/markdown', + url='https://github.com/fmalmeida/pythonScripts', + author='Felipe Almeida', + author_email='almeidafmarques@gmail.com', + license='GPLv3', + packages=['falmeida_py'], + install_requires=required, + entry_points={"console_scripts": ['falmeida-py = falmeida_py.__main__:main']}, + include_package_data=True, + zip_safe=False, + python_requires='>=3.6' +)