% Journal article in Empirical Software Engineering (Springer), volume 31, 2026.
% NOTE(review): "pages" holds the single value 5, presumably an article number
% rather than a page range -- confirm against the publisher record.
% NOTE(review): the abstract contains a UTF-8 em dash; process this file with a
% UTF-8-aware toolchain, or replace it with "---" if targeting 8-bit BibTeX.
@article{emse-2026-swh-topology,
  author    = {Pietri, Antoine and Rousseau, Guillaume and Zacchiroli, Stefano},
  title     = {Determining the intrinsic structure of public software development history: an exploratory study},
  journal   = {Empirical Software Engineering},
  publisher = {Springer},
  volume    = {31},
  pages     = {5},
  year      = {2026},
  issn      = {1382-3256},
  doi       = {10.1007/s10664-025-10741-y},
  abstract  = {Collaborative software development has produced a wealth of software source code artifacts (source files and directories, commits, releases, etc.) that have been studied for decades by researchers in empirical software engineering. Due to code reuse and the fork-based development model, those artifacts form a globally interconnected graph of a size comparable to the graph of the Web. Little is known yet about the network structure of this graph; such knowledge is useful to determine the best practical approaches to efficiently analyze very large subsets of it (if not all of it) in a methodologically sound manner. In this paper we determine the most salient network topology properties of the global public software development history as captured by state-of-the-art version control systems (VCS). As our corpus we use Software Heritage, one of the largest and most diverse publicly available archives of VCS data—encompassing 9 billion unique source code files and 2 billion unique commits coming from about 150 million projects or, as a graph, 19 billion nodes and 221 billion edges. We explore topology characteristics such as: degree distributions; distribution of connected component sizes; and distribution of shortest path lengths. We characterize these topology aspects for both the entire graph and relevant subgraphs.},
}
