# Bib-analysis
Simple Python script to perform some analysis and visualization of your bibliography.
---
**Script by Hugo TESSIER**
Beforehand, you need to prepare a simple text file containing, on each row, the arXiv ID of each of the papers in
your bibliography.
Then pass the path of this file as an argument to the script and it will generate two main outputs:
- A PDF presenting a graph that summarizes the citations between the papers of your bibliography and those they
cite themselves.
- A text file that shows the exact number of times each of these papers is cited by the papers of your bibliography.
Together, these outputs help determine which papers of your bibliography are the most important, as well as which
important papers may still be missing from it. As the data are queried with HTTP requests, the script needs an
internet connection to work.
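For example, assuming the script file is saved as `bib_analysis.py` (the file name here is just an assumption), a
`bibliography.txt` containing one arXiv ID per line, such as

```
1506.02626
1803.03635
1510.00149
```

(the IDs above are only illustrative), would be processed with:

```
python bib_analysis.py bibliography.txt
```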
On the graph, the blue nodes are the papers that belong to your bibliography. The red nodes are the papers, cited by
the blue ones, that are cited more often than the most cited blue one: they are presumably the ones not to miss. The
orange nodes are similar to the red ones, except that they are cited more often than the average number of citations
of the blue ones plus its standard deviation: they are cited an above-average number of times and are therefore
likely to deserve attention. The grey nodes are all the other papers.
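In other words, the coloring rule amounts to the following sketch (the function and variable names below are
illustrative, not the script's own):

```python
import numpy as np

def node_color(score, in_bibliography, bib_scores):
    """score: citation count of a node; bib_scores: counts of the blue nodes."""
    if in_bibliography:
        return 'blue'
    if score > np.max(bib_scores):  # beats the most cited blue paper
        return 'red'
    if score > np.mean(bib_scores) + np.std(bib_scores):  # above blue mean + std
        return 'orange'
    return 'grey'
```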
"""
Script by Hugo TESSIER

Beforehand, you need to prepare a simple text file containing, on each row, the arXiv ID of each of the papers in
your bibliography.
Then pass the path of this file as an argument to the script and it will generate two main outputs:
- A PDF presenting a graph that summarizes the citations between the papers of your bibliography and those they
cite themselves.
- A text file that shows the exact number of times each of these papers is cited by the papers of your bibliography.
Together, these outputs help determine which papers of your bibliography are the most important, as well as which
important papers may still be missing from it. As the data are queried with HTTP requests, the script needs an
internet connection to work.
On the graph, the blue nodes are the papers that belong to your bibliography. The red nodes are the papers, cited by
the blue ones, that are cited more often than the most cited blue one: they are presumably the ones not to miss. The
orange nodes are similar to the red ones, except that they are cited more often than the average number of citations
of the blue ones plus its standard deviation: they are cited an above-average number of times and are therefore
likely to deserve attention. The grey nodes are all the other papers.
"""
import urllib3
import json
import xml.etree.ElementTree as ET
import graphviz
import math
import numpy as np
import argparse


def parse_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument("path",
                        help="Path of the file containing the arXiv IDs of the papers of your bibliography, "
                             "one per line.",
                        type=str)
    args = parser.parse_args()
    return args.path


def get_my_bib(path):
    # Read the bibliography: one arXiv ID per line.
    with open(path, 'r') as f:
        return [line.strip() for line in f.readlines()]


def get_paper_refs(paper_id):
    # Query the Prophy API for the references of the given paper.
    url = f"https://www.prophy.science/api/arxiv/{paper_id}?include_unknown_references=1"
    http = urllib3.PoolManager()
    data = http.request('GET', url).data
    data = json.loads(data)
    # Keep only the references that have an arXiv ID themselves.
    references = [i['arxivId'] for i in data['references'] if i['arxivId'] is not None]
    return references


def get_paper_title(paper_id):
    # Query the arXiv export API and extract the paper's title from the Atom feed.
    url = f"http://export.arxiv.org/api/query?search_query=id:{paper_id}"
    http = urllib3.PoolManager()
    data = http.request('GET', url).data
    root = ET.fromstring(data)
    for c in root:
        if 'entry' in c.tag:
            for child in c:
                if 'title' in child.tag:
                    return child.text


def get_references_dict(bib):
    refs = dict()
    for paper in bib:
        print(get_paper_title(paper))
        paper_refs = get_paper_refs(paper)
        refs[paper] = paper_refs
    return refs


def get_papers_list(refs):
    # Deduplicated list of all papers: those of the bibliography plus every paper they cite.
    papers_list = list()
    for k, v in refs.items():
        if k not in papers_list:
            papers_list.append(k)
        for paper in v:
            if paper not in papers_list:
                papers_list.append(paper)
    return papers_list


def get_references_matrix(papers_list, refs):
    # Adjacency matrix of the citation graph: ref_matrix[i, j] == 1 when paper i cites paper j.
    ref_matrix = np.zeros((len(papers_list), len(papers_list)))
    for i, paper in enumerate(papers_list):
        if paper in refs:
            for r in refs[paper]:
                j = papers_list.index(r)
                ref_matrix[i, j] = 1
    return ref_matrix


def generate_pdf(title_list, papers_list, papers_score, bib_scores, refs):
    fontsize = 12
    g = graphviz.Graph('Bib', filename='bib.gv', engine='fdp')
    g.attr(splines='curved', concentrate='true', outputorder="edgesfirst", overlap="prism")
    for i, (t, p) in enumerate(zip(title_list, papers_list)):
        if p in refs:
            # Papers that belong to the bibliography.
            color = 'blue'
        elif papers_score[i] > np.max(bib_scores):
            # Cited more often than the most cited blue paper.
            color = 'red'
        elif papers_score[i] > np.mean(bib_scores) + np.std(bib_scores):
            # Cited more often than the blue average plus one standard deviation.
            color = 'orange'
        else:
            color = 'grey'
        # The node size grows logarithmically with the citation count.
        g.node(t, color=color, style='filled', fontsize=str(int(math.log(papers_score[i] + 2) * fontsize)))
    for k, v in refs.items():
        k_title = title_list[papers_list.index(k)]
        for r in v:
            r_title = title_list[papers_list.index(r)]
            g.edge(k_title, r_title)
    with g.subgraph(name='cluster_0') as c:
        c.attr(shape='plaintext', label='Caption', fontsize='40', pencolor="transparent")
        c.attr('node', shape='plaintext')
        c.node('table',
               label='<<TABLE BORDER="0"><TR><TD BGCOLOR="blue" WIDTH="100%"></TD><TD ALIGN="Left">Papers in the bibliography</TD></TR>'
                     '<TR><TD BGCOLOR="red"></TD><TD ALIGN="Left">Other papers that are cited more often than the most cited blue one</TD></TR>'
                     '<TR><TD BGCOLOR="orange"></TD><TD ALIGN="Left">Other papers that are cited more often than the blue average + std</TD></TR>'
                     '<TR><TD BGCOLOR="grey"></TD><TD ALIGN="Left">Other papers</TD></TR><TR><TD BORDER="0" COLSPAN="2"><FONT POINT-SIZE="20">The size of each node is logarithmically proportional to the number of times it is cited by the blue ones.</FONT></TD></TR></TABLE>>',
               fontsize='35')
    g.view()


def get_results_summary(title_list, papers_score):
    # Sort the papers by decreasing citation count.
    indices = np.flip(np.argsort(papers_score))
    papers_score = papers_score[indices]
    title_list = [title_list[i] for i in indices]
    with open('summary.txt', 'w') as f:
        f.write('Here is the summary of how many times each of these papers has been cited '
                'by the papers of your bibliography:\n\n')
        for i, t in enumerate(title_list):
            f.write(f'{int(papers_score[i])}\t{t}\n')


if __name__ == '__main__':
    path = parse_arguments()
    bib = get_my_bib(path)
    refs = get_references_dict(bib)
    papers_list = get_papers_list(refs)
    ref_matrix = get_references_matrix(papers_list, refs)
    # Use the titles as graphviz node names; replace spaces and strip colons,
    # which graphviz treats as port separators.
    title_list = [get_paper_title(i).replace(' ', '_').replace(':', '') for i in papers_list]
    # Column sums of the adjacency matrix: how many papers of the bibliography cite each paper.
    papers_score = ref_matrix.sum(axis=0)
    # Scores restricted to the papers of the bibliography itself (the blue nodes).
    bib_scores = np.array(papers_score)[[(i in bib) for i in papers_list]]
    generate_pdf(title_list, papers_list, papers_score, bib_scores, refs)
    get_results_summary(title_list, papers_score)
    print("DONE")