# Bib-analysis
Simple Python script to perform some analysis and visualization of your bibliography.
---
**Script by Hugo TESSIER**
Beforehand, you need to prepare a simple text file containing, on each row, the arXiv ID of each of the papers in
your bibliography.
Then pass the path of this file as an argument to the script and it will generate two main outputs:
- A PDF presenting a graph that summarizes the citations between the papers of your bibliography and those they
cite themselves.
- A text file that shows the exact number of times each of these papers is cited by the papers of your bibliography.
Together, these outputs help determine which papers of your bibliography are the most important, as well as which
important papers may still be missing from it. As the data are queried with HTTP requests, the script needs an
internet connection to work.
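For example, assuming the script file is saved as `bib_analysis.py` (the file name here is just an assumption), a
`bibliography.txt` containing one arXiv ID per line, such as

```
1506.02626
1803.03635
1510.00149
```

(the IDs above are only illustrative), would be processed with:

```
python bib_analysis.py bibliography.txt
```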
On the graph, the blue nodes are the papers that belong to your bibliography. The red nodes are the papers, cited by
the blue ones, that are cited more often than the most cited blue one: they are presumably the ones not to miss. The
orange nodes are similar to the red ones, except that they are cited more often than the average number of citations
of the blue ones plus its standard deviation: they are cited an above-average number of times and are therefore
likely to deserve attention. The grey nodes are all the other papers.
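In other words, the coloring rule amounts to the following sketch (the function and variable names below are
illustrative, not the script's own):

```python
import numpy as np

def node_color(score, in_bibliography, bib_scores):
    """score: citation count of a node; bib_scores: counts of the blue nodes."""
    if in_bibliography:
        return 'blue'
    if score > np.max(bib_scores):  # beats the most cited blue paper
        return 'red'
    if score > np.mean(bib_scores) + np.std(bib_scores):  # above blue mean + std
        return 'orange'
    return 'grey'
```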
"""
Script by Hugo TESSIER

Beforehand, you need to prepare a simple text file containing, on each row, the arXiv ID of each of the papers in
your bibliography.
Then pass the path of this file as an argument to the script and it will generate two main outputs:
- A PDF presenting a graph that summarizes the citations between the papers of your bibliography and those they
cite themselves.
- A text file that shows the exact number of times each of these papers is cited by the papers of your bibliography.
Together, these outputs help determine which papers of your bibliography are the most important, as well as which
important papers may still be missing from it. As the data are queried with HTTP requests, the script needs an
internet connection to work.
On the graph, the blue nodes are the papers that belong to your bibliography. The red nodes are the papers, cited by
the blue ones, that are cited more often than the most cited blue one: they are presumably the ones not to miss. The
orange nodes are similar to the red ones, except that they are cited more often than the average number of citations
of the blue ones plus its standard deviation: they are cited an above-average number of times and are therefore
likely to deserve attention. The grey nodes are all the other papers.
"""
import urllib3
import json
import xml.etree.ElementTree as ET
import graphviz
import math
import numpy as np
import argparse


def parse_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument("path",
                        help="Path of the file containing the arXiv IDs of the papers of your bibliography, "
                             "one per line.",
                        type=str)
    args = parser.parse_args()
    return args.path


def get_my_bib(path):
    # Read the bibliography: one arXiv ID per line.
    with open(path, 'r') as f:
        return [line.strip() for line in f.readlines()]


def get_paper_refs(paper_id):
    # Query the Prophy API for the references of the given paper.
    url = f"https://www.prophy.science/api/arxiv/{paper_id}?include_unknown_references=1"
    http = urllib3.PoolManager()
    data = http.request('GET', url).data
    data = json.loads(data)
    # Keep only the references that have an arXiv ID themselves.
    references = [i['arxivId'] for i in data['references'] if i['arxivId'] is not None]
    return references


def get_paper_title(paper_id):
    # Query the arXiv export API and extract the paper's title from the Atom feed.
    url = f"http://export.arxiv.org/api/query?search_query=id:{paper_id}"
    http = urllib3.PoolManager()
    data = http.request('GET', url).data
    root = ET.fromstring(data)
    for c in root:
        if 'entry' in c.tag:
            for child in c:
                if 'title' in child.tag:
                    return child.text


def get_references_dict(bib):
    refs = dict()
    for paper in bib:
        print(get_paper_title(paper))
        paper_refs = get_paper_refs(paper)
        refs[paper] = paper_refs
    return refs


def get_papers_list(refs):
    # Deduplicated list of all papers: those of the bibliography plus every paper they cite.
    papers_list = list()
    for k, v in refs.items():
        if k not in papers_list:
            papers_list.append(k)
        for paper in v:
            if paper not in papers_list:
                papers_list.append(paper)
    return papers_list


def get_references_matrix(papers_list, refs):
    # Adjacency matrix of the citation graph: ref_matrix[i, j] == 1 when paper i cites paper j.
    ref_matrix = np.zeros((len(papers_list), len(papers_list)))
    for i, paper in enumerate(papers_list):
        if paper in refs:
            for r in refs[paper]:
                j = papers_list.index(r)
                ref_matrix[i, j] = 1
    return ref_matrix


def generate_pdf(title_list, papers_list, papers_score, bib_scores, refs):
    fontsize = 12
    g = graphviz.Graph('Bib', filename='bib.gv', engine='fdp')
    g.attr(splines='curved', concentrate='true', outputorder="edgesfirst", overlap="prism")
    for i, (t, p) in enumerate(zip(title_list, papers_list)):
        if p in refs:
            # Papers that belong to the bibliography.
            color = 'blue'
        elif papers_score[i] > np.max(bib_scores):
            # Cited more often than the most cited blue paper.
            color = 'red'
        elif papers_score[i] > np.mean(bib_scores) + np.std(bib_scores):
            # Cited more often than the blue average plus one standard deviation.
            color = 'orange'
        else:
            color = 'grey'
        # The node size grows logarithmically with the citation count.
        g.node(t, color=color, style='filled', fontsize=str(int(math.log(papers_score[i] + 2) * fontsize)))
    for k, v in refs.items():
        k_title = title_list[papers_list.index(k)]
        for r in v:
            r_title = title_list[papers_list.index(r)]
            g.edge(k_title, r_title)
    with g.subgraph(name='cluster_0') as c:
        c.attr(shape='plaintext', label='Caption', fontsize='40', pencolor="transparent")
        c.attr('node', shape='plaintext')
        c.node('table',
               label='<<TABLE BORDER="0"><TR><TD BGCOLOR="blue" WIDTH="100%"></TD><TD ALIGN="Left">Papers in the bibliography</TD></TR>'
                     '<TR><TD BGCOLOR="red"></TD><TD ALIGN="Left">Other papers that are cited more often than the most cited blue one</TD></TR>'
                     '<TR><TD BGCOLOR="orange"></TD><TD ALIGN="Left">Other papers that are cited more often than the blue average + std</TD></TR>'
                     '<TR><TD BGCOLOR="grey"></TD><TD ALIGN="Left">Other papers</TD></TR><TR><TD BORDER="0" COLSPAN="2"><FONT POINT-SIZE="20">The size of each node is logarithmically proportional to the number of times it is cited by the blue ones.</FONT></TD></TR></TABLE>>',
               fontsize='35')
    g.view()


def get_results_summary(title_list, papers_score):
    # Sort the papers by decreasing citation count.
    indices = np.flip(np.argsort(papers_score))
    papers_score = papers_score[indices]
    title_list = [title_list[i] for i in indices]
    with open('summary.txt', 'w') as f:
        f.write('Here is the summary of how many times each of these papers has been cited '
                'by the papers of your bibliography:\n\n')
        for i, t in enumerate(title_list):
            f.write(f'{int(papers_score[i])}\t{t}\n')


if __name__ == '__main__':
    path = parse_arguments()
    bib = get_my_bib(path)
    refs = get_references_dict(bib)
    papers_list = get_papers_list(refs)
    ref_matrix = get_references_matrix(papers_list, refs)
    # Use the titles as graphviz node names; replace spaces and strip colons,
    # which graphviz treats as port separators.
    title_list = [get_paper_title(i).replace(' ', '_').replace(':', '') for i in papers_list]
    # Column sums of the adjacency matrix: how many papers of the bibliography cite each paper.
    papers_score = ref_matrix.sum(axis=0)
    # Scores restricted to the papers of the bibliography itself (the blue nodes).
    bib_scores = np.array(papers_score)[[(i in bib) for i in papers_list]]
    generate_pdf(title_list, papers_list, papers_score, bib_scores, refs)
    get_results_summary(title_list, papers_score)
    print("DONE")